perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![warn(clippy::all)]
117#![allow(
118 // Core allows for lexer code
119 clippy::too_many_lines,
120 clippy::module_name_repetitions,
121 clippy::cast_possible_truncation,
122 clippy::cast_sign_loss,
123 clippy::cast_possible_wrap,
124 clippy::cast_precision_loss,
125 clippy::must_use_candidate,
126 clippy::missing_errors_doc,
127 clippy::missing_panics_doc,
128
129 // Lexer-specific patterns that are fine
130 clippy::match_same_arms,
131 clippy::redundant_else,
132 clippy::unnecessary_wraps,
133 clippy::unused_self,
134 clippy::items_after_statements,
135 clippy::struct_excessive_bools,
136 clippy::uninlined_format_args
137)]
138
139use std::sync::{Arc, OnceLock};
140
141pub mod api;
142pub mod builtins;
143pub mod checkpoint;
144pub mod config;
145pub mod error;
146mod heredoc;
147pub mod keywords;
148pub mod limits;
149pub mod mode;
150mod quote_handler;
151pub mod token;
152pub mod tokenizer;
153mod unicode;
154
155pub use api::*;
156pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
157pub use config::LexerConfig;
158pub use error::{LexerError, Result};
159pub use limits::MAX_REGEX_PARSE_STEPS;
160pub use mode::LexerMode;
161pub use perl_position_tracking::Position;
162pub use token::{StringPart, Token, TokenType};
163
164use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
165
166use crate::heredoc::HeredocSpec;
167use crate::limits::{
168 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
169};
170
/// Context-aware Perl lexer that produces a token stream from source text.
///
/// The lexer tracks an internal [`LexerMode`] to disambiguate context-sensitive
/// syntax (e.g., `/` as division vs. regex delimiter). Construct with
/// [`PerlLexer::new`] and call [`PerlLexer::next_token`] or
/// [`PerlLexer::collect_tokens`] to consume the stream.
///
/// # Examples
///
/// ```rust
/// use perl_lexer::{PerlLexer, TokenType};
///
/// let mut lexer = PerlLexer::new("my $x = 42;");
/// let tokens = lexer.collect_tokens();
/// assert!(!tokens.is_empty());
/// ```
pub struct PerlLexer<'a> {
    /// Source text being tokenized (borrowed for the lexer's lifetime)
    input: &'a str,
    /// Cached input bytes for faster access
    input_bytes: &'a [u8],
    /// Current byte offset into `input`
    position: usize,
    /// Current term/operator expectation used to disambiguate `/`, `%`, etc.
    mode: LexerMode,
    /// Tunables: interpolation parsing, position tracking, max lookahead
    config: LexerConfig,
    /// Stack for nested delimiters in s{}{} constructs
    delimiter_stack: Vec<char>,
    /// Track if we're inside prototype parens after 'sub'
    in_prototype: bool,
    /// Paren depth to track when we exit prototype
    prototype_depth: usize,
    /// Track if we just saw a 'sub' keyword (waiting for possible prototype)
    after_sub: bool,
    /// Track if we just saw a '->' operator (to suppress s/tr/y as substitution)
    after_arrow: bool,
    /// Depth of hash-subscript brace nesting.
    /// When > 0, suppresses quote-op detection so `m`, `s`, `q*`, `tr`, `y`
    /// are treated as bareword identifiers (hash keys) rather than regex operators.
    /// Depth tracking means all positions inside `$h{...}` — including after commas
    /// in hash slices like `@h{m, s}` — correctly suppress quote-op misidentification.
    hash_brace_depth: usize,
    /// Set to `true` immediately after emitting a complete `$var`, `@var`, or `%var`
    /// token (not bare sigils used for dereference). Cleared by any operator,
    /// punctuation, or keyword token. The `{` handler increments `hash_brace_depth`
    /// only when this flag is set, ensuring only genuine hash/slice subscripts
    /// (e.g. `$h{m}`, `@h{s, tr}`) suppress quote-op detection — not block-opening
    /// braces after `sub foo`, `if (cond)`, `else`, `while (cond)`, etc.
    after_var_subscript: bool,
    /// Depth of open parentheses — used to distinguish `(1<<func())` (bitshift)
    /// from `print $fh <<END` (heredoc at statement level, paren_depth == 0).
    paren_depth: usize,
    /// Current position with line/column tracking
    #[allow(dead_code)]
    current_pos: Position,
    /// Track if we just skipped a newline (for __DATA__/__END__ detection)
    after_newline: bool,
    /// Queue of pending heredocs waiting for their bodies
    pending_heredocs: Vec<HeredocSpec>,
    /// Track the byte offset of the current line's start
    line_start_offset: usize,
    /// If true, emit `HeredocBody` tokens; otherwise just consume them.
    emit_heredoc_body_tokens: bool,
    /// Current quote operator being parsed
    current_quote_op: Option<quote_handler::QuoteOperatorInfo>,
    /// Track if EOF has been emitted to prevent infinite loops
    eof_emitted: bool,
    /// Start time for timeout protection
    start_time: std::time::Instant,
}
238
239impl<'a> PerlLexer<'a> {
240 /// Create a new lexer for the given input
241 pub fn new(input: &'a str) -> Self {
242 Self::with_config(input, LexerConfig::default())
243 }
244
245 /// Create a new lexer with custom configuration
246 pub fn with_config(input: &'a str, config: LexerConfig) -> Self {
247 Self {
248 input,
249 input_bytes: input.as_bytes(),
250 position: 0,
251 mode: LexerMode::ExpectTerm,
252 config,
253 delimiter_stack: Vec::new(),
254 in_prototype: false,
255 prototype_depth: 0,
256 after_sub: false,
257 after_arrow: false,
258 hash_brace_depth: 0,
259 after_var_subscript: false,
260 paren_depth: 0,
261 current_pos: Position::start(),
262 after_newline: true, // Start of file counts as after newline
263 pending_heredocs: Vec::new(),
264 line_start_offset: 0,
265 emit_heredoc_body_tokens: false,
266 current_quote_op: None,
267 eof_emitted: false,
268 start_time: std::time::Instant::now(),
269 }
270 }
271
272 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
273 pub fn with_body_tokens(input: &'a str) -> Self {
274 let mut lexer = Self::new(input);
275 lexer.emit_heredoc_body_tokens = true;
276 lexer
277 }
278
279 /// Normalize file start by skipping BOM if present
280 fn normalize_file_start(&mut self) {
281 // Skip UTF-8 BOM (EF BB BF) if at file start
282 if self.position == 0 && self.matches_bytes(&[0xEF, 0xBB, 0xBF]) {
283 self.position = 3;
284 self.line_start_offset = 3;
285 }
286 }
287
    /// Set the lexer mode (for resetting state at statement boundaries)
    ///
    /// Exposed so a driving parser can override the current term/operator
    /// expectation directly.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
292
293 /// Helper to check if remaining bytes on a line are only spaces/tabs
294 #[inline]
295 fn trailing_ws_only(bytes: &[u8], mut p: usize) -> bool {
296 while p < bytes.len() && bytes[p] != b'\n' && bytes[p] != b'\r' {
297 match bytes[p] {
298 b' ' | b'\t' => p += 1,
299 _ => return false,
300 }
301 }
302 true
303 }
304
305 /// Consume a newline sequence (CRLF or LF) and update state
306 #[inline]
307 fn consume_newline(&mut self) {
308 if self.position >= self.input.len() {
309 return;
310 }
311 match self.input_bytes[self.position] {
312 b'\r' => {
313 self.position += 1;
314 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n' {
315 self.position += 1;
316 }
317 }
318 b'\n' => self.advance(),
319 _ => return, // not at a newline
320 }
321 self.after_newline = true;
322 self.line_start_offset = self.position;
323 }
324
325 /// Find the end of the current line, returning both raw end and visible end (without trailing CR)
326 #[inline]
327 fn find_line_end(bytes: &[u8], start: usize) -> (usize, usize) {
328 let mut end = start;
329 while end < bytes.len() && bytes[end] != b'\n' && bytes[end] != b'\r' {
330 end += 1;
331 }
332 let visible_end = end;
333 (end, visible_end)
334 }
335
    /// Parse a quoted heredoc delimiter such as the `"END"` in `<<"END"`.
    ///
    /// Called with the cursor on the opening `quote` character. Accumulates
    /// the raw text (including both quotes) into `text` and returns the bare
    /// delimiter label on success. Returns `None` — so normal tokenization
    /// can continue — if a CR/LF appears before the closing quote or the
    /// input ends first.
    #[inline]
    fn parse_quoted_heredoc_delimiter(&mut self, quote: char, text: &mut String) -> Option<String> {
        text.push(quote);
        self.advance();

        let mut delim = String::new();
        while self.position < self.input.len() {
            let Some(ch) = self.current_char() else {
                break;
            };

            // Closing quote: delimiter complete.
            if ch == quote {
                text.push(ch);
                self.advance();
                return Some(delim);
            }

            // Delimiter quoting cannot span a line. If we hit CR/LF before the
            // closing quote, this is not a valid heredoc opener.
            if ch == '\n' || ch == '\r' {
                return None;
            }

            delim.push(ch);
            text.push(ch);
            self.advance();
        }

        // Unterminated quoted delimiter: degrade gracefully by treating this as
        // not-a-heredoc so normal tokenization can continue.
        None
    }
368
    /// Advance the lexer and return the next token.
    ///
    /// Returns `None` only after an `EOF` token has already been emitted.
    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
    ///
    /// The body is a single loop so that heredoc-body consumption and
    /// quote-operator recovery can `continue` instead of recursing.
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            // (heredoc bodies are whitespace-significant).
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow.
                // A body_start of 0 doubles as the "not in a body" sentinel.
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // Allow any leading spaces/tabs before the label
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any)
                            // — stacked heredocs' bodies follow one another.
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            // A `None` from the skipper ends the token stream early.
            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first.
            // Order matters: contextual forms (heredocs, strings, variables)
            // must win over the generic operator/delimiter fallbacks below.
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_vstring() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Unicode path without intermediate heap allocation
                let mut buf = [0_u8; 4];
                Arc::from(ch.encode_utf8(&mut buf))
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
633
634 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
635 ///
636 /// **Purpose**: Protect against pathological input that could cause:
637 /// - Infinite loops in regex/heredoc parsing
638 /// - Excessive memory consumption
639 /// - LSP server hangs
640 ///
641 /// **Limits**:
642 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
643 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
644 ///
645 /// **Graceful Degradation**:
646 /// - Budget exceeded → emit `UnknownRest` token
647 /// - Jump to EOF to prevent further parsing of problematic region
648 /// - LSP client can emit soft diagnostic about truncation
649 /// - All previously parsed symbols remain valid
650 ///
651 /// **Performance**:
652 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
653 /// - Slow path: Only triggered on pathological input
654 /// - Amortized cost: O(1) per token
655 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
656 #[inline(always)]
657 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
658 // Fast path: most calls won't hit limits
659 let bytes_consumed = self.position - start;
660 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
661 return None;
662 }
663
664 // Slow path: budget exceeded - graceful degradation
665 #[cfg(debug_assertions)]
666 {
667 tracing::debug!(
668 bytes_consumed,
669 depth,
670 position = self.position,
671 "Lexer budget exceeded"
672 );
673 }
674
675 self.position = self.input.len();
676 Some(Token {
677 token_type: TokenType::UnknownRest,
678 text: Arc::from(""),
679 start,
680 end: self.position,
681 })
682 }
683
    /// Peek at the next token without consuming it.
    ///
    /// Saves and restores the full lexer state so the next call to
    /// [`next_token`](Self::next_token) returns the same token.
    ///
    /// Note: this clones the delimiter stack, the pending-heredoc queue, and
    /// the in-flight quote-operator info, so peeking is not free when that
    /// state is non-empty. Any new field added to the lexer that
    /// `next_token` mutates must also be saved/restored here.
    pub fn peek_token(&mut self) -> Option<Token> {
        // Snapshot every piece of state next_token() may mutate.
        let saved_pos = self.position;
        let saved_mode = self.mode;
        let saved_delimiter_stack = self.delimiter_stack.clone();
        let saved_prototype = self.in_prototype;
        let saved_depth = self.prototype_depth;
        let saved_after_sub = self.after_sub;
        let saved_after_arrow = self.after_arrow;
        let saved_hash_brace_depth = self.hash_brace_depth;
        let saved_after_var_subscript = self.after_var_subscript;
        let saved_paren_depth = self.paren_depth;
        let saved_current_pos = self.current_pos;
        let saved_after_newline = self.after_newline;
        let saved_pending_heredocs = self.pending_heredocs.clone();
        let saved_line_start_offset = self.line_start_offset;
        let saved_current_quote_op = self.current_quote_op.clone();
        let saved_eof_emitted = self.eof_emitted;
        let saved_start_time = self.start_time;

        let token = self.next_token();

        // Roll everything back so the peeked token is produced again.
        self.position = saved_pos;
        self.mode = saved_mode;
        self.delimiter_stack = saved_delimiter_stack;
        self.in_prototype = saved_prototype;
        self.prototype_depth = saved_depth;
        self.after_sub = saved_after_sub;
        self.after_arrow = saved_after_arrow;
        self.hash_brace_depth = saved_hash_brace_depth;
        self.after_var_subscript = saved_after_var_subscript;
        self.paren_depth = saved_paren_depth;
        self.current_pos = saved_current_pos;
        self.after_newline = saved_after_newline;
        self.pending_heredocs = saved_pending_heredocs;
        self.line_start_offset = saved_line_start_offset;
        self.current_quote_op = saved_current_quote_op;
        self.eof_emitted = saved_eof_emitted;
        self.start_time = saved_start_time;

        token
    }
729
730 /// Consume all remaining tokens and return them as a vector.
731 ///
732 /// The returned vector always ends with an `EOF` token.
733 pub fn collect_tokens(&mut self) -> Vec<Token> {
734 let mut tokens = Vec::new();
735 while let Some(token) = self.next_token() {
736 if token.token_type == TokenType::EOF {
737 tokens.push(token);
738 break;
739 }
740 tokens.push(token);
741 }
742 tokens
743 }
744
745 /// Reset the lexer to the beginning of the input.
746 ///
747 /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
748 /// so the lexer can re-tokenize the same source from scratch.
749 pub fn reset(&mut self) {
750 self.position = 0;
751 self.mode = LexerMode::ExpectTerm;
752 self.delimiter_stack.clear();
753 self.in_prototype = false;
754 self.prototype_depth = 0;
755 self.after_sub = false;
756 self.after_arrow = false;
757 self.hash_brace_depth = 0;
758 self.after_var_subscript = false;
759 self.paren_depth = 0;
760 self.current_pos = Position::start();
761 self.after_newline = true;
762 self.pending_heredocs.clear();
763 self.line_start_offset = 0;
764 self.current_quote_op = None;
765 self.eof_emitted = false;
766 self.start_time = std::time::Instant::now();
767 }
768
    /// Switch the lexer into format-body parsing mode.
    ///
    /// In this mode the lexer consumes input verbatim until it encounters a
    /// line containing only `.` (the Perl format terminator). The next call
    /// to `next_token` dispatches to the format-body parser.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
776
777 // Internal helper methods
778
779 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
780 #[inline(always)]
781 fn byte_at(bytes: &[u8], index: usize) -> u8 {
782 debug_assert!(index < bytes.len());
783 match bytes.get(index) {
784 Some(&byte) => byte,
785 None => 0,
786 }
787 }
788
789 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
790 #[inline(always)]
791 fn current_char(&self) -> Option<char> {
792 if self.position < self.input_bytes.len() {
793 // For ASCII, direct access is safe
794 let byte = Self::byte_at(self.input_bytes, self.position);
795 if byte < 128 {
796 Some(byte as char)
797 } else {
798 // For non-ASCII, fall back to proper UTF-8 parsing
799 self.input.get(self.position..).and_then(|s| s.chars().next())
800 }
801 } else {
802 None
803 }
804 }
805
    /// Peek at the character `offset` positions ahead without advancing.
    ///
    /// `offset` is treated as a *byte* offset for the bounds/ASCII check, but
    /// the non-ASCII fallback uses `chars().nth(offset)`, i.e. a *char*
    /// offset from the current position.
    /// NOTE(review): those two interpretations only coincide when every
    /// character between the cursor and the target is ASCII — confirm that
    /// callers never peek across multi-byte characters.
    #[inline(always)]
    fn peek_char(&self, offset: usize) -> Option<char> {
        // Lookahead is capped by configuration to bound disambiguation work.
        if offset > self.config.max_lookahead {
            return None;
        }

        let pos = self.position.checked_add(offset)?;
        if pos < self.input_bytes.len() {
            // For ASCII, direct access is safe
            let byte = Self::byte_at(self.input_bytes, pos);
            if byte < 128 {
                Some(byte as char)
            } else {
                // For non-ASCII, use chars iterator
                self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
            }
        } else {
            None
        }
    }
826
827 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
828 #[inline(always)]
829 fn advance(&mut self) {
830 if self.position < self.input_bytes.len() {
831 let byte = Self::byte_at(self.input_bytes, self.position);
832 if byte < 128 {
833 // ASCII fast path
834 self.position += 1;
835 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
836 {
837 self.position += ch.len_utf8();
838 }
839 }
840 }
841
    /// General-purpose balanced-segment consumer (no quote-boundary recovery).
    ///
    /// Called with the cursor on `open`; consumes through the matching
    /// `close`, honoring nesting and backslash escapes. Returns
    /// `Some(position)` just past the closing delimiter on success, or
    /// `None` if the input ends before the segment balances (the cursor is
    /// left wherever scanning stopped).
    ///
    /// For use inside double-quoted string interpolation where the outer `"` must
    /// act as a recovery boundary, use [`consume_balanced_segment_in_string`] instead.
    #[allow(dead_code)]
    #[inline]
    fn consume_balanced_segment(&mut self, open: char, close: char) -> Option<usize> {
        // Must be sitting on the opening delimiter.
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Consume the backslash and the escaped character as a pair
                    // so an escaped delimiter doesn't affect nesting depth.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        None
    }
880
    /// Balanced-segment consumer for interpolation tails inside quoted strings.
    ///
    /// Like [`consume_balanced_segment`], but additionally treats
    /// `terminator` (the outer string's closing quote) as a hard stop:
    /// hitting it returns `None` *without consuming it*, so the outer string
    /// parser can still terminate its token cleanly. Returns
    /// `Some(position)` just past the matching `close` on success.
    #[inline]
    fn consume_balanced_segment_in_string(
        &mut self,
        open: char,
        close: char,
        terminator: char,
    ) -> Option<usize> {
        // Must be sitting on the opening delimiter.
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Consume the backslash and the escaped character as a pair.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == terminator => {
                    // Local recovery for interpolation tails in quoted strings:
                    // stop at the closing quote so the outer string parser can
                    // still terminate this token cleanly.
                    return None;
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        None
    }
925
926 /// Fast byte-level check for ASCII characters
927 #[inline]
928 fn peek_byte(&self, offset: usize) -> Option<u8> {
929 if offset > self.config.max_lookahead {
930 return None;
931 }
932
933 let pos = self.position.checked_add(offset)?;
934 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
935 }
936
937 /// Check if the next bytes match a pattern (ASCII only)
938 #[inline]
939 fn matches_bytes(&self, pattern: &[u8]) -> bool {
940 let Some(end_offset) = pattern.len().checked_sub(1) else {
941 return true;
942 };
943
944 if end_offset > self.config.max_lookahead {
945 return false;
946 }
947
948 let Some(end) = self.position.checked_add(pattern.len()) else {
949 return false;
950 };
951
952 if end <= self.input_bytes.len() {
953 &self.input_bytes[self.position..end] == pattern
954 } else {
955 false
956 }
957 }
958
/// Advance `self.position` past whitespace, `#` line comments, and POD
/// sections, returning `Some(())` when done (the only return value; the
/// `Option` shape keeps the signature uniform with other lexer steps).
///
/// Side effects beyond moving the cursor:
/// - clears `after_newline` unless the cursor is still at the start of a line;
/// - on each newline, stamps `body_start` on the first pending heredoc that
///   does not have one yet (FIFO order);
/// - leaves `#` untouched in `ExpectDelimiter` mode, where it is a quote-like
///   delimiter rather than a comment.
#[inline]
fn skip_whitespace_and_comments(&mut self) -> Option<()> {
    // Don't reset after_newline if we're at the start of a line
    if self.position > 0 && self.position != self.line_start_offset {
        self.after_newline = false;
    }

    while self.position < self.input_bytes.len() {
        let byte = Self::byte_at(self.input_bytes, self.position);
        match byte {
            // Fast path for ASCII whitespace - batch process
            b' ' => {
                // Batch skip spaces for better cache efficiency
                let start = self.position;
                while self.position < self.input_bytes.len()
                    && Self::byte_at(self.input_bytes, self.position) == b' '
                {
                    self.position += 1;
                }
                // Continue outer loop if we processed any spaces
                if self.position > start {
                    // Loop naturally continues to next iteration
                }
            }
            b'\t' | 0x0B | 0x0C => {
                // Batch skip horizontal tab, vertical tab, and form feed.
                // Perl treats these as whitespace separators.
                let start = self.position;
                while self.position < self.input_bytes.len()
                    && matches!(
                        Self::byte_at(self.input_bytes, self.position),
                        b'\t' | 0x0B | 0x0C
                    )
                {
                    self.position += 1;
                }
                if self.position > start {
                    // Loop naturally continues to next iteration
                }
            }
            b'\r' | b'\n' => {
                self.consume_newline();

                // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                // Only check if we have pending heredocs to avoid unnecessary work
                if !self.pending_heredocs.is_empty() {
                    for spec in &mut self.pending_heredocs {
                        if spec.body_start == 0 {
                            spec.body_start = self.position;
                            break; // Only set for the first unresolved heredoc
                        }
                    }
                }
            }
            b'#' => {
                // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                if matches!(self.mode, LexerMode::ExpectDelimiter) {
                    break;
                }

                // Skip line comment using memchr for fast newline search
                self.position += 1; // Skip # directly

                // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
                if let Some(newline_offset) =
                    memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
                {
                    self.position += newline_offset;
                } else {
                    // No newline found, skip to end
                    self.position = self.input_bytes.len();
                }
            }
            // '=' only opens a POD section when it is the first byte of a line.
            b'=' if self.position == 0
                || (self.position > 0
                    && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
            {
                // Check if this starts a POD section (=pod, =head, =over, etc.)
                // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                let remaining = &self.input_bytes[self.position..];
                if remaining.starts_with(b"=pod")
                    || remaining.starts_with(b"=head")
                    || remaining.starts_with(b"=over")
                    || remaining.starts_with(b"=item")
                    || remaining.starts_with(b"=back")
                    || remaining.starts_with(b"=begin")
                    || remaining.starts_with(b"=end")
                    || remaining.starts_with(b"=for")
                    || remaining.starts_with(b"=encoding")
                {
                    // Scan forward for \n=cut (end of POD block)
                    let search_start = self.position;
                    let mut found_cut = false;
                    let bytes = self.input_bytes;
                    let mut i = search_start;
                    while i < bytes.len() {
                        // Look for =cut at the start of a line
                        if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
                            && bytes[i..].starts_with(b"=cut")
                        {
                            i += 4; // Skip "=cut"
                            // Skip rest of the =cut line
                            while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
                                i += 1;
                            }
                            // Consume one line ending sequence if present
                            // (CRLF counts as a single line ending)
                            if i < bytes.len() && bytes[i] == b'\r' {
                                i += 1;
                                if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                            } else if i < bytes.len() && bytes[i] == b'\n' {
                                i += 1;
                            }
                            self.position = i;
                            found_cut = true;
                            break;
                        }
                        i += 1;
                    }
                    if !found_cut {
                        // POD extends to end of file
                        self.position = bytes.len();
                    }
                    continue;
                }
                // Not a POD directive - regular '=' token
                break;
            }
            _ => {
                // For non-ASCII whitespace, use char check only when needed
                if byte >= 128
                    && let Some(ch) = self.current_char()
                    && ch.is_whitespace()
                {
                    self.advance();
                    continue;
                }
                break;
            }
        }
    }
    Some(())
}
1103
/// Try to lex a heredoc opener (`<<LABEL`, `<<~LABEL`, `<<"LABEL"`, `<<\LABEL`).
///
/// On success, returns a `HeredocStart` token and queues a `HeredocSpec` whose
/// `body_start` is filled in later (when the following newline is seen by
/// `skip_whitespace_and_comments`). On failure, restores `self.position` and
/// returns `None` so `<<` can be re-lexed as the left-shift operator.
fn try_heredoc(&mut self) -> Option<Token> {
    // `<<` is the left-shift operator, not a heredoc, when we are inside
    // a parenthesized expression and have just finished a term.
    // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
    // so `<<index` must be the bitshift operator, not a heredoc start.
    //
    // We must NOT fire the guard at statement level (paren_depth == 0) because
    // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
    // is a heredoc. The depth check distinguishes the two cases.
    if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
        return None;
    }

    // Check for heredoc start
    if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
        return None;
    }

    let start = self.position;
    // `text` accumulates the exact opener spelling (including `~`, spaces,
    // backslash, and the delimiter) so the token reproduces the source.
    let mut text = String::from("<<");
    self.position += 2; // Skip <<

    // Check for indented heredoc (~)
    let allow_indent = if self.current_char() == Some('~') {
        text.push('~');
        self.advance();
        true
    } else {
        false
    };

    // Skip whitespace between `<<` and the delimiter (kept in `text`)
    while let Some(ch) = self.current_char() {
        if ch == ' ' || ch == '\t' {
            text.push(ch);
            self.advance();
        } else {
            break;
        }
    }

    // Optional backslash disables interpolation, treat like single-quoted label
    let backslashed = if self.current_char() == Some('\\') {
        text.push('\\');
        self.advance();
        true
    } else {
        false
    };

    // Parse delimiter: quoted ("..."/'...'/`...`) unless backslashed,
    // otherwise a bare identifier-like word.
    let delimiter = if self.position < self.input.len() {
        match self.current_char() {
            Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
            Some('\'') if !backslashed => {
                self.parse_quoted_heredoc_delimiter('\'', &mut text)?
            }
            Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
            Some(c) if is_perl_identifier_start(c) => {
                // Bare word delimiter
                let mut delim = String::new();
                while self.position < self.input.len() {
                    if let Some(c) = self.current_char() {
                        if is_perl_identifier_continue(c) {
                            delim.push(c);
                            text.push(c);
                            self.advance();
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                delim
            }
            _ => {
                // Not a valid heredoc delimiter - reset position and return None
                // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
                self.position = start;
                return None;
            }
        }
    } else {
        // No delimiter found - reset position and return None
        self.position = start;
        return None;
    };

    // For now, return a placeholder token
    // The actual heredoc body would be parsed later when we encounter it
    self.mode = LexerMode::ExpectOperator;

    // Recursion depth limit (Issue #443): refuse to queue unbounded heredocs
    if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
        return Some(Token {
            token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
            text: Arc::from(text),
            start,
            end: self.position,
        });
    }

    // Queue the heredoc spec with its label
    self.pending_heredocs.push(HeredocSpec {
        label: Arc::from(delimiter.as_str()),
        body_start: 0, // Will be set when we see the newline after this line
        allow_indent,
    });

    Some(Token {
        token_type: TokenType::HeredocStart,
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
1221
1222 fn try_string(&mut self) -> Option<Token> {
1223 let start = self.position;
1224 let quote = self.current_char()?;
1225
1226 match quote {
1227 '"' => self.parse_double_quoted_string(start),
1228 '\'' => self.parse_single_quoted_string(start),
1229 '`' => self.parse_backtick_string(start),
1230 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1231 _ => None,
1232 }
1233 }
1234
1235 #[inline]
1236 fn try_number(&mut self) -> Option<Token> {
1237 let start = self.position;
1238
1239 // Fast byte check for digits - optimized bounds checking
1240 let bytes = self.input_bytes;
1241 if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
1242 return None;
1243 }
1244
1245 // Check for hex (0x), binary (0b), or octal (0o) prefixes
1246 let mut pos = self.position;
1247 if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
1248 let prefix_byte = bytes[pos + 1];
1249 if prefix_byte == b'x' || prefix_byte == b'X' {
1250 // Hexadecimal: 0x[0-9a-fA-F_]+
1251 pos += 2; // consume '0x'
1252 let digit_start = pos;
1253 let mut saw_digit = false;
1254 while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
1255 saw_digit |= bytes[pos].is_ascii_hexdigit();
1256 pos += 1;
1257 }
1258 if pos > digit_start && saw_digit {
1259 self.position = pos;
1260 let text = &self.input[start..self.position];
1261 self.mode = LexerMode::ExpectOperator;
1262 return Some(Token {
1263 token_type: TokenType::Number(Arc::from(text)),
1264 text: Arc::from(text),
1265 start,
1266 end: self.position,
1267 });
1268 }
1269 // No hex digits after 0x - fall through to parse '0' as decimal
1270 } else if prefix_byte == b'b' || prefix_byte == b'B' {
1271 // Binary: 0b[01_]+
1272 pos += 2; // consume '0b'
1273 let digit_start = pos;
1274 let mut saw_digit = false;
1275 while pos < bytes.len()
1276 && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
1277 {
1278 saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
1279 pos += 1;
1280 }
1281 if pos > digit_start && saw_digit {
1282 self.position = pos;
1283 let text = &self.input[start..self.position];
1284 self.mode = LexerMode::ExpectOperator;
1285 return Some(Token {
1286 token_type: TokenType::Number(Arc::from(text)),
1287 text: Arc::from(text),
1288 start,
1289 end: self.position,
1290 });
1291 }
1292 // No binary digits after 0b - fall through to parse '0' as decimal
1293 } else if prefix_byte == b'o' || prefix_byte == b'O' {
1294 // Octal (explicit): 0o[0-7_]+
1295 pos += 2; // consume '0o'
1296 let digit_start = pos;
1297 let mut saw_digit = false;
1298 while pos < bytes.len()
1299 && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
1300 {
1301 saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
1302 pos += 1;
1303 }
1304 if pos > digit_start && saw_digit {
1305 self.position = pos;
1306 let text = &self.input[start..self.position];
1307 self.mode = LexerMode::ExpectOperator;
1308 return Some(Token {
1309 token_type: TokenType::Number(Arc::from(text)),
1310 text: Arc::from(text),
1311 start,
1312 end: self.position,
1313 });
1314 }
1315 // No octal digits after 0o - fall through to parse '0' as decimal
1316 }
1317 }
1318
1319 // Consume initial digits - unrolled for better performance
1320 pos = self.position;
1321 while pos < bytes.len() {
1322 let byte = Self::byte_at(bytes, pos);
1323 if byte.is_ascii_digit() || byte == b'_' {
1324 pos += 1;
1325 } else {
1326 break;
1327 }
1328 }
1329 self.position = pos;
1330
1331 // Check for decimal point - optimized with single bounds check
1332 if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
1333 // Peek ahead to see what follows the dot
1334 let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();
1335
1336 // Optimized dot consumption logic
1337 let should_consume_dot = has_following_digit || {
1338 pos + 1 >= bytes.len() || {
1339 // Use bitwise operations for faster character classification
1340 let next_byte = bytes[pos + 1];
1341 // Whitespace, delimiters, operators - optimized check
1342 next_byte <= b' '
1343 || matches!(
1344 next_byte,
1345 b';' | b','
1346 | b')'
1347 | b'}'
1348 | b']'
1349 | b'+'
1350 | b'-'
1351 | b'*'
1352 | b'/'
1353 | b'%'
1354 | b'='
1355 | b'<'
1356 | b'>'
1357 | b'!'
1358 | b'&'
1359 | b'|'
1360 | b'^'
1361 | b'~'
1362 | b'e'
1363 | b'E'
1364 )
1365 }
1366 };
1367
1368 if should_consume_dot {
1369 pos += 1; // consume the dot
1370 // Consume fractional digits - batch processing
1371 while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
1372 pos += 1;
1373 }
1374 self.position = pos;
1375 }
1376 }
1377
1378 // Check for exponent - optimized
1379 if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
1380 let exp_start = pos;
1381 pos += 1; // consume 'e' or 'E'
1382
1383 // Check for optional sign
1384 if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
1385 pos += 1;
1386 }
1387
1388 // Must have at least one digit after exponent (underscores allowed between digits)
1389 let mut saw_digit = false;
1390 while pos < bytes.len() {
1391 let byte = bytes[pos];
1392 if byte.is_ascii_digit() {
1393 saw_digit = true;
1394 pos += 1;
1395 } else if byte == b'_' {
1396 pos += 1;
1397 } else {
1398 break;
1399 }
1400 }
1401
1402 // If no digits after exponent, backtrack
1403 if !saw_digit {
1404 pos = exp_start;
1405 }
1406
1407 self.position = pos;
1408 }
1409
1410 // Avoid string slicing for common number cases - use Arc::from directly on slice
1411 let text = &self.input[start..self.position];
1412 self.mode = LexerMode::ExpectOperator;
1413
1414 Some(Token {
1415 token_type: TokenType::Number(Arc::from(text)),
1416 text: Arc::from(text),
1417 start,
1418 end: self.position,
1419 })
1420 }
1421
1422 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1423 // We're at the dot, consume it
1424 self.advance();
1425
1426 // Parse the fractional part
1427 while self.position < self.input_bytes.len() {
1428 let byte = self.input_bytes[self.position];
1429 match byte {
1430 b'0'..=b'9' | b'_' => self.position += 1,
1431 b'e' | b'E' => {
1432 // Handle scientific notation.
1433 // Save the position of 'e'/'E' so we can backtrack here if
1434 // no digits follow the exponent marker (with or without sign).
1435 let e_pos = self.position;
1436 self.advance();
1437 if self.position < self.input_bytes.len() {
1438 let next = self.input_bytes[self.position];
1439 if next == b'+' || next == b'-' {
1440 self.advance();
1441 }
1442 }
1443 // Parse exponent digits (underscores allowed between digits)
1444 let exponent_start = self.position;
1445 let mut saw_digit = false;
1446 while self.position < self.input_bytes.len() {
1447 let byte = self.input_bytes[self.position];
1448 if byte.is_ascii_digit() {
1449 saw_digit = true;
1450 self.position += 1;
1451 } else if byte == b'_' {
1452 self.position += 1;
1453 } else {
1454 break;
1455 }
1456 }
1457
1458 // No digits after exponent marker — backtrack to just before
1459 // 'e'/'E' so the caller sees it as a separate token.
1460 // Using e_pos (not exponent_start-1) avoids including 'e' in
1461 // the number slice when a sign character was consumed.
1462 if !saw_digit {
1463 let _ = exponent_start; // mark as intentionally unused
1464 self.position = e_pos;
1465 }
1466 break;
1467 }
1468 _ => break,
1469 }
1470 }
1471
1472 let text = &self.input[start..self.position];
1473 self.mode = LexerMode::ExpectOperator;
1474
1475 Some(Token {
1476 token_type: TokenType::Number(Arc::from(text)),
1477 text: Arc::from(text),
1478 start,
1479 end: self.position,
1480 })
1481 }
1482
/// Lex a variable token beginning with a sigil (`$`, `@`, `%`, or `*`).
///
/// Handles plain names (`$foo`, `@Foo::bar`), the array-length form
/// (`$#array`), braced forms (`${foo}`, `${^MATCH}`, `$::{foo}`, `*{$glob}`),
/// punctuation variables (`$?`, `$$`, `$^W`, `@+`, `%-`), and emits a
/// bare-sigil token when the sigil opens a dereference (`@{$ref}`) or a
/// postfix deref after `->` so the parser can handle the braces itself.
///
/// Side effects: switches the mode to `ExpectOperator` and sets
/// `after_var_subscript` where a following `{` must count as a subscript.
fn try_variable(&mut self) -> Option<Token> {
    let start = self.position;
    let sigil = self.current_char()?;

    match sigil {
        '$' | '@' | '%' | '*' => {
            // In ExpectOperator mode, treat % and * as operators rather than sigils
            if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                return None;
            }
            self.advance();

            // Special case: After ->, sigils followed by { or [ should be tokenized separately
            // This is for postfix dereference like ->@*, ->%{}, ->@[]
            // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
            let check_arrow = self.position >= 3
                && self.position.saturating_sub(1) <= self.input.len()
                && self.input.is_char_boundary(self.position.saturating_sub(3))
                && self.input.is_char_boundary(self.position.saturating_sub(1));

            if check_arrow
                && {
                    // Temporarily rewind to look for the `->` just before the sigil,
                    // then restore the cursor regardless of the outcome.
                    let saved = self.position;
                    self.position -= 3;
                    let arrow = self.matches_bytes(b"->");
                    self.position = saved;
                    arrow
                }
                && matches!(self.current_char(), Some('{' | '[' | '*'))
            {
                // Just return the sigil
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;

                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }

            // Check for $# (array length operator)
            if sigil == '$' && self.current_char() == Some('#') {
                self.advance(); // consume #
                // Now parse the array name
                while let Some(ch) = self.current_char() {
                    if is_perl_identifier_continue(ch) {
                        self.advance();
                    } else if ch == ':' && self.peek_char(1) == Some(':') {
                        // Package-qualified array name
                        self.advance();
                        self.advance();
                    } else {
                        break;
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // $#foo is a complete variable token; a following `{` is a subscript.
                self.after_var_subscript = true;

                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }

            // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
            if self.current_char() == Some('{') {
                // Peek ahead to decide if we should consume the brace
                let next_char = self.peek_char(1);

                // Check if this is a dereference like @{$ref} or @{[...]}
                // If the next char suggests dereference, don't consume the brace.
                // For @ and % sigils, identifiers inside braces are also derefs
                // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                let is_deref = sigil != '*'
                    && (matches!(
                        next_char,
                        Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                    ) || (matches!(sigil, '@' | '%')
                        && next_char.is_some_and(is_perl_identifier_start)));
                if is_deref {
                    // This is a dereference, don't consume the brace
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // A standalone sigil token before `{` starts a dereference
                    // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
                    // Mark it as subscript-capable so `{` increments brace depth
                    // and the closing `}` can enable chained `{...}` subscripts.
                    // (Broader form than master's `$|@|%` filter — `*` is already
                    // excluded by the `is_deref` guard above and `&` deref also
                    // benefits from chained-subscript handling.)
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                self.advance(); // consume {

                // Handle special variables with caret, e.g. ${^MATCH}
                if self.current_char() == Some('^') {
                    self.advance(); // consume ^
                    // Parse the special variable name
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance(); // consume }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                // Handle stash access like $::{foo}
                else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                    self.advance(); // consume first :
                    self.advance(); // consume second :
                    // Skip optional { and }
                    if self.current_char() == Some('{') {
                        self.advance();
                    }
                    // Parse the name
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance();
                            if self.current_char() == Some('}') {
                                self.advance(); // consume closing } of ${...}
                            }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                // Regular braced variable like ${foo} or glob like *{$glob}
                else {
                    // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                    // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                    // EXCEPT for globs - *{$glob} should be parsed as one token
                    // Also check for empty braces or EOF - in these cases we should split the tokens
                    if sigil != '*'
                        && (matches!(
                            self.current_char(),
                            Some(
                                '$' | '@'
                                    | '%'
                                    | '*'
                                    | '&'
                                    | '['
                                    | ' '
                                    | '\t'
                                    | '\n'
                                    | '\r'
                                    | '}'
                            )
                        ) || self.current_char().is_none())
                    {
                        // This is a dereference or empty/invalid brace, backtrack
                        self.position = start + 1; // Just past the sigil
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // Same as above: sigil-only token means a dereference opener.
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    // For glob access, we need to consume everything inside braces
                    // (nested braces tracked with a depth counter)
                    if sigil == '*' {
                        let mut brace_depth: usize = 1;
                        while let Some(ch) = self.current_char() {
                            if ch == '{' {
                                brace_depth += 1;
                            } else if ch == '}' {
                                brace_depth = brace_depth.saturating_sub(1);
                                if brace_depth == 0 {
                                    self.advance(); // consume final }
                                    break;
                                }
                            }
                            self.advance();
                        }
                    } else {
                        // Regular variable: consume the identifier up to the closing }
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
            }
            // Parse regular variable name
            else if let Some(ch) = self.current_char() {
                if is_perl_identifier_start(ch) {
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                    // Handle package-qualified segments like Foo::bar
                    while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance();
                        self.advance();
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
                // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                // Not inside prototypes where ^ is a literal prototype char
                else if sigil == '$' && ch == '^' && !self.in_prototype {
                    self.advance(); // consume ^
                    // $^Letter: consume the single uppercase letter
                    if let Some(letter) = self.current_char()
                        && letter.is_ascii_uppercase()
                    {
                        self.advance();
                    }
                    // bare $^ (no uppercase letter follows): format_top_name — stop here
                }
                // Handle special punctuation variables
                // Not inside prototypes where ; and , are literal prototype chars
                else if sigil == '$'
                    && !self.in_prototype
                    && matches!(
                        ch,
                        '?' | '!'
                            | '@'
                            | '&'
                            | '`'
                            | '\''
                            | '.'
                            | '/'
                            | '\\'
                            | '|'
                            | '+'
                            | '-'
                            | '['
                            | ']'
                            | '$'
                            | '~'
                            | '='
                            | '%'
                            | ','
                            | '"'
                            | ';'
                            | '>'
                            | '<'
                            | ')'
                            | '(' // $( = real group ID of this process
                    )
                {
                    self.advance(); // consume the special character
                }
                // $$ is the PID special variable, but only when it is not immediately
                // followed by an identifier-start character. $$var is scalar dereference
                // of $var, so keep the second $ for the next token.
                else if sigil == '$' && ch == '$' {
                    if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                        self.advance(); // consume the second $ for bare $$ PID
                    }
                }
                // Handle special array/hash punctuation variables (@+, @-, %+, %-)
                else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                    self.advance(); // consume the + or -
                }
            }

            let text = &self.input[start..self.position];
            self.mode = LexerMode::ExpectOperator;
            // A complete $foo, @foo, %foo token can be followed by a hash/slice
            // subscript `{`. Set the flag so the `{` handler knows to increment
            // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
            // hash subscripts in the same way.
            self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

            Some(Token {
                token_type: TokenType::Identifier(Arc::from(text)),
                text: Arc::from(text),
                start,
                end: self.position,
            })
        }
        _ => None,
    }
}
1799
1800 /// Return the next non-space char and the char immediately following it (without consuming).
1801 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1802 /// from `=` used as a plain delimiter.
1803 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1804 let mut i = self.position;
1805 while i < self.input.len() {
1806 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1807 Some(c) => c,
1808 None => return (None, None),
1809 };
1810 if c.is_whitespace() {
1811 i += c.len_utf8();
1812 continue;
1813 }
1814 // Found non-space at position i; peek the next char after it
1815 let j = i + c.len_utf8();
1816 let following = self.input.get(j..).and_then(|s| s.chars().next());
1817 return (Some(c), following);
1818 }
1819 (None, None)
1820 }
1821
/// Is `c` usable as a quote-like delimiter? (non-alnum, including paired)
///
/// Perl accepts any character that is neither ASCII-alphanumeric nor
/// whitespace — including control characters
/// (e.g. s\x07pattern\x07replacement\x07).
fn is_quote_delim(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => false,
        ws if ws.is_whitespace() => false,
        _ => true,
    }
}
1828
1829 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1830 ///
1831 /// A v-string starts with `v` followed by one or more digits, then optionally
1832 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1833 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1834 #[inline]
1835 fn try_vstring(&mut self) -> Option<Token> {
1836 let start = self.position;
1837 let bytes = self.input_bytes;
1838
1839 // Must start with 'v' followed by at least one digit
1840 if start >= bytes.len() || bytes[start] != b'v' {
1841 return None;
1842 }
1843
1844 let next_pos = start + 1;
1845 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1846 return None;
1847 }
1848
1849 // We have `v` followed by a digit — scan the rest of the v-string.
1850 // Pattern: v DIGITS (.DIGITS)*
1851 let mut pos = next_pos;
1852
1853 // Consume leading digits
1854 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1855 pos += 1;
1856 }
1857
1858 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1859 while pos < bytes.len() && bytes[pos] == b'.' {
1860 let dot_pos = pos;
1861 pos += 1; // skip '.'
1862
1863 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1864 // Dot not followed by digit — not part of the v-string
1865 pos = dot_pos;
1866 break;
1867 }
1868
1869 // Consume digits after the dot
1870 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1871 pos += 1;
1872 }
1873 }
1874
1875 // Make sure the v-string isn't followed by identifier-continuation characters
1876 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1877 if pos < bytes.len() {
1878 let next_byte = bytes[pos];
1879 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1880 return None;
1881 }
1882 // Also check for non-ASCII identifier continuations
1883 if next_byte >= 128
1884 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1885 && is_perl_identifier_continue(ch)
1886 {
1887 return None;
1888 }
1889 }
1890
1891 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1892 let text = &self.input[start..pos];
1893
1894 self.position = pos;
1895 self.mode = LexerMode::ExpectOperator;
1896
1897 Some(Token {
1898 token_type: TokenType::Version(Arc::from(text)),
1899 text: Arc::from(text),
1900 start,
1901 end: self.position,
1902 })
1903 }
1904
1905 #[inline]
1906 fn try_identifier_or_keyword(&mut self) -> Option<Token> {
1907 let start = self.position;
1908 let ch = self.current_char()?;
1909 let bytes = self.input_bytes;
1910 let len = bytes.len();
1911
1912 if is_perl_identifier_start(ch) {
1913 // Special case: substitution/transliteration with single-quote delimiter
1914 // The single quote is considered an identifier continuation, so we need to
1915 // detect these operators before consuming it as part of an identifier.
1916 if !self.after_arrow
1917 && self.hash_brace_depth == 0
1918 && ch == 's'
1919 && self.peek_char(1) == Some('\'')
1920 {
1921 self.advance(); // consume 's'
1922 return self.parse_substitution(start);
1923 } else if !self.after_arrow
1924 && self.hash_brace_depth == 0
1925 && ch == 'y'
1926 && self.peek_char(1) == Some('\'')
1927 {
1928 self.advance(); // consume 'y'
1929 return self.parse_transliteration(start);
1930 } else if !self.after_arrow
1931 && self.hash_brace_depth == 0
1932 && ch == 't'
1933 && self.peek_char(1) == Some('r')
1934 && self.peek_char(2) == Some('\'')
1935 {
1936 self.advance(); // consume 't'
1937 self.advance(); // consume 'r'
1938 return self.parse_transliteration(start);
1939 }
1940
1941 // Fast ASCII path for identifier continuation.
1942 while self.position < len {
1943 let byte = bytes[self.position];
1944 if byte == b'\'' && is_quote_op_word_prefix(&bytes[start..self.position]) {
1945 // Keep apostrophe for quote-operator parsing in cases like q'...'.
1946 break;
1947 }
1948
1949 if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
1950 self.position += 1;
1951 continue;
1952 }
1953
1954 if byte < 128 {
1955 break;
1956 }
1957
1958 if let Some(ch) = self.current_char()
1959 && is_perl_identifier_continue(ch)
1960 {
1961 self.advance();
1962 continue;
1963 }
1964 break;
1965 }
1966 // Handle package-qualified identifiers like Foo::bar.
1967 while self.config.max_lookahead >= 1
1968 && self.position + 1 < len
1969 && bytes[self.position] == b':'
1970 && bytes[self.position + 1] == b':'
1971 {
1972 self.position += 2; // consume '::'
1973
1974 // consume following identifier segment if present
1975 let Some(ch) = self.current_char() else {
1976 break;
1977 };
1978 if !is_perl_identifier_start(ch) {
1979 break;
1980 }
1981 self.advance();
1982 while self.position < len {
1983 let byte = bytes[self.position];
1984 if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
1985 self.position += 1;
1986 continue;
1987 }
1988 if byte < 128 {
1989 break;
1990 }
1991 if let Some(ch) = self.current_char()
1992 && is_perl_identifier_continue(ch)
1993 {
1994 self.advance();
1995 continue;
1996 }
1997 break;
1998 }
1999 }
2000
2001 let text = &self.input[start..self.position];
2002
2003 // Check for __DATA__ and __END__ markers using exact match
2004 // Only recognize these in code channel, not inside data/format sections or heredocs
2005 let in_code_channel =
2006 !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
2007 && self.pending_heredocs.is_empty();
2008
2009 let marker = if in_code_channel {
2010 if text == "__DATA__" {
2011 Some("__DATA__")
2012 } else if text == "__END__" {
2013 Some("__END__")
2014 } else {
2015 None
2016 }
2017 } else {
2018 None
2019 };
2020
2021 if let Some(marker_text) = marker {
2022 // These must be at the beginning of a line
2023 // Use the after_newline flag to determine if we're at line start
2024 if self.after_newline {
2025 // Check if rest of line is only whitespace
2026 // Only treat as data marker if line has no trailing junk
2027 if Self::trailing_ws_only(self.input_bytes, self.position) {
2028 // Consume the rest of the line (the marker line)
2029 while self.position < self.input.len()
2030 && self.input_bytes[self.position] != b'\n'
2031 && self.input_bytes[self.position] != b'\r'
2032 {
2033 self.advance();
2034 }
2035 self.consume_newline();
2036
2037 // Switch to data section mode
2038 self.mode = LexerMode::InDataSection;
2039
2040 return Some(Token {
2041 token_type: TokenType::DataMarker(Arc::from(marker_text)),
2042 text: Arc::from(marker_text),
2043 start,
2044 end: self.position,
2045 });
2046 }
2047 }
2048 }
2049
2050 // Check for substitution/transliteration operators
2051 // Skip if after '->' -- these are method names, not operators.
2052 #[allow(clippy::collapsible_if)]
2053 if !self.after_arrow && self.hash_brace_depth == 0 && matches!(text, "s" | "tr" | "y") {
2054 let immediate = self.current_char();
2055 let (candidate, char_after_next, has_whitespace) =
2056 if immediate.is_some_and(|c| c.is_whitespace()) {
2057 let (nc, ca) = self.peek_nonspace_and_following();
2058 (nc, ca, true)
2059 } else {
2060 let following = immediate.and_then(|c| {
2061 let j = self.position + c.len_utf8();
2062 self.input.get(j..).and_then(|s| s.chars().next())
2063 });
2064 (immediate, following, false)
2065 };
2066
2067 if let Some(next) = candidate {
2068 // `s => 1` should remain a fat-arrow hash key, not quote op.
2069 let is_fat_arrow = next == '=' && char_after_next == Some('>');
2070 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
2071 let is_quote_char = matches!(next, '\'' | '"') && text != "s";
2072 let transliteration_allows_whitespace = text == "tr" || text == "y";
2073 let substitution_disallows_whitespace = text == "s" && has_whitespace;
2074 let is_valid_delim = Self::is_quote_delim(next)
2075 && !is_fat_arrow
2076 && !substitution_disallows_whitespace
2077 && (!has_whitespace
2078 || is_paired_delim
2079 || is_quote_char
2080 || transliteration_allows_whitespace);
2081
2082 if is_valid_delim {
2083 match text {
2084 "s" => return self.parse_substitution(start),
2085 "tr" | "y" => return self.parse_transliteration(start),
2086 unexpected => {
2087 return Some(Token {
2088 token_type: TokenType::Error(Arc::from(format!(
2089 "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
2090 unexpected, start
2091 ))),
2092 text: Arc::from(unexpected),
2093 start,
2094 end: self.position,
2095 });
2096 }
2097 }
2098 }
2099 }
2100 }
2101
2102 let token_type = if is_keyword_fast(text) {
2103 // Check for special keywords that affect lexer mode
2104 match text {
2105 "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
2106 | "sort" | "split" | "and" | "or" | "xor" | "not"
2107 // These keywords introduce an expression, so a following `/` is a
2108 // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
2109 // `do /file/`, and `eval /re/` are all valid Perl.
2110 | "return" | "die" | "warn" | "do" | "eval" => {
2111 self.mode = LexerMode::ExpectTerm;
2112 }
2113 "sub" => {
2114 self.after_sub = true;
2115 self.mode = LexerMode::ExpectTerm;
2116 }
2117 // Quote operators expect a delimiter next.
2118 // Skip if after '->' -- these are method names, not operators.
2119 // Skip inside hash subscript braces (hash_brace_depth > 0) — all
2120 // positions inside `$h{...}` or `@h{...}` treat quote-op names as
2121 // bareword keys, including after commas in slices like `@h{m, s}`.
2122 op if !self.after_arrow
2123 && self.hash_brace_depth == 0
2124 && quote_handler::is_quote_operator(op) =>
2125 {
2126 // Perl allows whitespace between a quote-like operator and its delimiter,
2127 // but ONLY for paired delimiters (s { ... } { ... }g).
2128 // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
2129 // must be immediately adjacent — otherwise `s $foo` would wrongly
2130 // treat `$` as a delimiter instead of being a bareword `s` followed
2131 // by a scalar variable.
2132 //
2133 // Strategy:
2134 // 1. Check the immediately-adjacent char first (no whitespace skip).
2135 // If it is a valid delimiter → any non-alnum, non-whitespace char.
2136 // 2. If the adjacent char is whitespace, peek past it.
2137 // Only accept PAIRED delimiters ({, [, (, <) in that case.
2138 let immediate = self.current_char();
2139 let (candidate, char_after_next, has_whitespace) =
2140 if immediate.is_some_and(|c| c.is_whitespace()) {
2141 // There is whitespace — peek past it
2142 let (nc, ca) = self.peek_nonspace_and_following();
2143 (nc, ca, true)
2144 } else {
2145 // No whitespace — use immediate char
2146 let following = immediate.and_then(|c| {
2147 let j = self.position + c.len_utf8();
2148 self.input.get(j..).and_then(|s| s.chars().next())
2149 });
2150 (immediate, following, false)
2151 };
2152
2153 if let Some(next) = candidate {
2154 // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
2155 // not a valid substitution delimiter. Treat as identifier.
2156 let is_fat_arrow = next == '=' && char_after_next == Some('>');
2157
2158 // When whitespace precedes the delimiter, only unambiguous
2159 // delimiters are accepted:
2160 // - Paired delimiters ({, [, (, <) are always safe.
2161 // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
2162 // is a valid file-size filetest and must not be treated as a
2163 // substitution start. All other operators (qw, q, qq, qr, qx, m,
2164 // tr, y) have no corresponding file-test operator.
2165 // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
2166 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
2167 let is_quote_char = matches!(next, '\'' | '"') && op != "s";
2168 let is_valid_delim = Self::is_quote_delim(next)
2169 && !is_fat_arrow
2170 && (!has_whitespace || is_paired_delim || is_quote_char);
2171
2172 if is_valid_delim {
2173 self.mode = LexerMode::ExpectDelimiter;
2174 self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
2175 operator: op.to_string(),
2176 delimiter: '\0', // Will be set when we see the delimiter
2177 start_pos: start,
2178 });
2179
2180 // Don't return a keyword token - continue to parse the delimiter
2181 // Skip any whitespace between operator and delimiter
2182 while let Some(ch) = self.current_char() {
2183 if ch.is_whitespace() {
2184 self.advance();
2185 } else {
2186 break;
2187 }
2188 }
2189
2190 // Get the delimiter
2191 #[allow(clippy::collapsible_if)]
2192 if let Some(delim) = self.current_char() {
2193 if !delim.is_alphanumeric() {
2194 self.advance();
2195 if let Some(ref mut info) = self.current_quote_op {
2196 info.delimiter = delim;
2197 }
2198 // Parse the quote operator content and return the complete token
2199 return self.parse_quote_operator(delim);
2200 }
2201 }
2202 } else {
2203 // Not a quote operator here → treat as IDENTIFIER
2204 self.current_quote_op = None;
2205 self.mode = LexerMode::ExpectOperator;
2206 return Some(Token {
2207 token_type: TokenType::Identifier(Arc::from(text)),
2208 start,
2209 end: self.position,
2210 text: Arc::from(text),
2211 });
2212 }
2213 } else {
2214 // End-of-input after the word → also treat as IDENTIFIER
2215 self.current_quote_op = None;
2216 self.mode = LexerMode::ExpectOperator;
2217 return Some(Token {
2218 token_type: TokenType::Identifier(Arc::from(text)),
2219 start,
2220 end: self.position,
2221 text: Arc::from(text),
2222 });
2223 }
2224 // If we get here but haven't returned, something went wrong
2225 // Fall through to treat as identifier
2226 self.current_quote_op = None;
2227 self.mode = LexerMode::ExpectOperator;
2228 return Some(Token {
2229 token_type: TokenType::Identifier(Arc::from(text)),
2230 start,
2231 end: self.position,
2232 text: Arc::from(text),
2233 });
2234 }
2235 // Format declarations need special handling
2236 "format" => {
2237 // We'll need to check for the = after the format name
2238 // For now, just mark that we saw format
2239 }
2240 _ if is_builtin_function(text) => {
2241 // Bare builtins are term-introducing in Perl.
2242 self.mode = LexerMode::ExpectTerm;
2243 }
2244 _ => {
2245 self.mode = LexerMode::ExpectOperator;
2246 }
2247 }
2248 TokenType::Keyword(Arc::from(text))
2249 } else {
2250 // Mirror parser bare-builtin handling so `/` after builtins like
2251 // `join` or `print` is lexed as a regex term, not division.
2252 if is_builtin_function(text) {
2253 self.mode = LexerMode::ExpectTerm;
2254 } else {
2255 self.mode = LexerMode::ExpectOperator;
2256 }
2257 TokenType::Identifier(Arc::from(text))
2258 };
2259
2260 self.after_arrow = false;
2261 // A keyword/identifier is not a variable; `{` after it is a block opener.
2262 self.after_var_subscript = false;
2263 // hash_brace_depth is managed by { and } handlers, not cleared per-token
2264 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2265 } else {
2266 None
2267 }
2268 }
2269
2270 /// Parse data section body - consumes everything to EOF
2271 fn parse_data_body(&mut self) -> Option<Token> {
2272 if self.position >= self.input.len() {
2273 // Already at EOF
2274 self.mode = LexerMode::ExpectTerm;
2275 return Some(Token {
2276 token_type: TokenType::EOF,
2277 text: Arc::from(""),
2278 start: self.position,
2279 end: self.position,
2280 });
2281 }
2282
2283 let start = self.position;
2284 // Consume everything to EOF
2285 let body = &self.input[self.position..];
2286 self.position = self.input.len();
2287
2288 // Reset mode for next parse (though we're at EOF)
2289 self.mode = LexerMode::ExpectTerm;
2290
2291 Some(Token {
2292 token_type: TokenType::DataBody(Arc::from(body)),
2293 text: Arc::from(body),
2294 start,
2295 end: self.position,
2296 })
2297 }
2298
2299 /// Parse format body - consumes until a line with just a dot
2300 fn parse_format_body(&mut self) -> Option<Token> {
2301 let start = self.position;
2302 let mut body = String::new();
2303 let mut line_start = true;
2304
2305 while self.position < self.input.len() {
2306 // Check if we're at the start of a line and the next char is a dot
2307 if line_start && self.current_char() == Some('.') {
2308 // Check if this line contains only a dot
2309 let mut peek_pos = self.position + 1;
2310 let mut found_terminator = true;
2311
2312 // Skip any trailing whitespace on the dot line
2313 while peek_pos < self.input.len() {
2314 match self.input_bytes[peek_pos] {
2315 b' ' | b'\t' | b'\r' => peek_pos += 1,
2316 b'\n' => break,
2317 _ => {
2318 found_terminator = false;
2319 break;
2320 }
2321 }
2322 }
2323
2324 if found_terminator {
2325 // We found the terminating dot, consume it
2326 self.position = peek_pos;
2327 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2328 {
2329 self.position += 1;
2330 }
2331
2332 // Switch back to normal mode
2333 self.mode = LexerMode::ExpectTerm;
2334
2335 return Some(Token {
2336 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2337 text: Arc::from(body),
2338 start,
2339 end: self.position,
2340 });
2341 }
2342 }
2343
2344 // Not a terminator, consume the character
2345 match self.current_char() {
2346 Some(ch) => {
2347 body.push(ch);
2348 self.advance();
2349
2350 // Track if we're at the start of a line
2351 line_start = ch == '\n';
2352 }
2353 None => {
2354 // Reached EOF without finding terminator
2355 break;
2356 }
2357 }
2358 }
2359
2360 // If we reach here, we didn't find a terminator
2361 self.mode = LexerMode::ExpectTerm;
2362 Some(Token {
2363 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2364 text: Arc::from(body),
2365 start,
2366 end: self.position,
2367 })
2368 }
2369
    /// Scan an operator token at the current position.
    ///
    /// Returns `None` when the current character cannot start an operator,
    /// or when the lexer is waiting for a quote-operator delimiter (which
    /// must not be consumed here). On success, updates `self.mode` (most
    /// operators switch to `ExpectTerm`; postfix `++`/`--` keep
    /// `ExpectOperator`), and clears/sets the `after_sub`, `after_arrow`,
    /// and `after_var_subscript` flags accordingly. Handles the `/`
    /// division-vs-regex ambiguity per the strategy documented below.
    fn try_operator(&mut self) -> Option<Token> {
        // Skip operator parsing if we're expecting a delimiter for a quote operator
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            return None;
        }

        let start = self.position;
        let ch = self.current_char()?;

        // ═══════════════════════════════════════════════════════════════════════
        // SLASH DISAMBIGUATION STRATEGY (Issue #422)
        // ═══════════════════════════════════════════════════════════════════════
        //
        // Perl's `/` character is ambiguous:
        //   - Division operator: `$x / 2`
        //   - Regex delimiter: `/pattern/`
        //   - Defined-or operator: `$x // $y`
        //
        // **Disambiguation Strategy (Context-Aware Heuristics):**
        //
        // 1. **Mode-Based Decision (Primary)**:
        //    - `LexerMode::ExpectTerm` → `/` starts a regex
        //      Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
        //    - `LexerMode::ExpectOperator` → `/` is division or `//`
        //      Examples: `$x / 2`, `$x // $y`, `) / 3`
        //
        // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
        //    Mode is set based on previous token:
        //    - After identifier/number/closing paren → ExpectOperator → division
        //    - After operator/keyword/opening paren → ExpectTerm → regex
        //
        // 3. **Budget Protection**:
        //    - Regex parsing has a parse-step budget and byte budget
        //    - Budget exceeded → emit UnknownRest token (graceful degradation)
        //    - See `parse_regex()` and `budget_guard()` for implementation
        //
        // 4. **Performance Characteristics**:
        //    - Single-pass: O(1) decision based on mode flag
        //    - No backtracking: Mode updated after each token
        //    - Optimized: Byte-level operations for common cases
        //
        // **Metrics & Monitoring**:
        // - Budget exceeded events tracked via UnknownRest token emission
        // - LSP diagnostics generated for truncated regexes
        // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
        //
        // ═══════════════════════════════════════════════════════════════════════

        if ch == '/' {
            if self.mode == LexerMode::ExpectTerm {
                // Mode indicates we're expecting a term → `/` starts a regex
                // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
                return self.parse_regex(start);
            } else {
                // Mode indicates we're expecting an operator → `/` is division or `//`
                // Examples: `$x / 2`, `$x // $y`, `10 / 3`
                self.advance();
                // Check for // or //= using byte-level operations for speed
                if self.peek_byte(0) == Some(b'/') {
                    self.position += 1; // consume second / directly
                    if self.peek_byte(0) == Some(b'=') {
                        self.position += 1; // consume = directly
                        // `//=` defined-or assignment.
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    } else {
                        // Use cached string for common "//" operator
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from("//")),
                            text: Arc::from("//"),
                            start,
                            end: self.position,
                        });
                    }
                } else if self.position < self.input_bytes.len()
                    && self.input_bytes[self.position] == b'='
                {
                    // /= division-assign operator
                    self.position += 1; // consume =
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Operator(Arc::from("/=")),
                        text: Arc::from("/="),
                        start,
                        end: self.position,
                    });
                } else {
                    // Use cached string for common "/" division
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Division,
                        text: Arc::from("/"),
                        start,
                        end: self.position,
                    });
                }
            }
        }

        // Handle other operators - simplified
        match ch {
            '.' => {
                // Check if it's a decimal number like .5 -- but only when we
                // expect a term. In operator position `.5` is concatenation
                // of the bareword/number on the left with the number `5`.
                if self.mode != LexerMode::ExpectOperator
                    && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
                {
                    return self.parse_decimal_number(start);
                }
                self.advance();
                // Check for compound operators (e.g. `..`, `.=`)
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        // NOTE(review): in this branch `ch` is always '.', so only
                        // the `...` (yada/flip-flop range) arm below can fire; the
                        // matches! arms and the `<=>` check appear unreachable here
                        // and mirror the generic operator branch — confirm before
                        // simplifying.
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            } else if ch == '.' && next == '.' && third == Some('.') {
                                self.advance(); // consume the third .
                            }
                        }
                    }
                }
            }
            '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
            | '\\' => {
                self.advance();
                // Check for compound operators
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        // NOTE(review): the ('/', '/', '=') arm cannot fire here —
                        // '/' is fully handled (and returns) earlier in this
                        // function — confirm before removing.
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            }
                        }
                    }
                }
            }
            _ => return None,
        }

        let text = &self.input[start..self.position];
        // Operator ends prototype window (e.g. `:` for attributes)
        self.after_sub = false;
        // Track whether this operator is '->' for method name disambiguation
        self.after_arrow = text == "->";
        // Any operator token ends the "just saw a variable" window; `{` after
        // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
        self.after_var_subscript = false;
        // Postfix ++ and -- complete a term expression, so next token is an operator
        // (e.g., "$x++ / 2" → / is division, not regex)
        if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
            // Postfix: stay in ExpectOperator
        } else {
            self.mode = LexerMode::ExpectTerm;
        }

        Some(Token {
            token_type: TokenType::Operator(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
2574
    /// Scan a structural delimiter token: parentheses, brackets, braces,
    /// comma, semicolon, or — when a quote operator is pending — the quote
    /// delimiter itself (any non-alphanumeric, non-whitespace character).
    ///
    /// Returns `None` if the current character is not a delimiter. Side
    /// effects maintained here: `paren_depth`, `hash_brace_depth`, the
    /// prototype-tracking flags (`after_sub`, `in_prototype`,
    /// `prototype_depth`), `after_arrow`, `after_var_subscript`, and
    /// `self.mode`. The exact flag transitions per delimiter are documented
    /// inline — they drive regex/division and subscript/block disambiguation.
    fn try_delimiter(&mut self) -> Option<Token> {
        let start = self.position;
        let ch = self.current_char()?;

        // If we're expecting a delimiter for a quote operator, handle it specially
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            // Accept any non-alphanumeric character as a delimiter
            if !ch.is_alphanumeric() && !ch.is_whitespace() {
                self.advance();
                if let Some(ref mut info) = self.current_quote_op {
                    info.delimiter = ch;
                }
                // Now parse the quote operator content
                return self.parse_quote_operator(ch);
            }
        }

        match ch {
            '(' => {
                // Check if this is a quote operator delimiter
                if matches!(self.mode, LexerMode::ExpectDelimiter)
                    && self.current_quote_op.is_some()
                {
                    self.advance();
                    if let Some(ref mut info) = self.current_quote_op {
                        info.delimiter = ch;
                    }
                    return self.parse_quote_operator(ch);
                }

                self.advance();
                if self.after_sub {
                    // Promote after_sub to in_prototype now that we see '('
                    self.in_prototype = true;
                    self.after_sub = false;
                    self.prototype_depth = 1;
                } else if self.in_prototype {
                    // Nested '(' inside a prototype — track depth so the
                    // matching ')' can close it correctly.
                    self.prototype_depth += 1;
                }
                self.paren_depth += 1;
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::LeftParen,
                    text: Arc::from("("),
                    start,
                    end: self.position,
                })
            }
            ')' => {
                self.advance();
                // Unwind prototype tracking; depth 0 means the prototype
                // parenthesis group is fully closed.
                if self.in_prototype && self.prototype_depth > 0 {
                    self.prototype_depth -= 1;
                    if self.prototype_depth == 0 {
                        self.in_prototype = false;
                    }
                }
                self.after_arrow = false;
                self.paren_depth = self.paren_depth.saturating_sub(1);
                // A closing paren ends any var-subscript context: `if ($var)` should
                // NOT leave after_var_subscript set, otherwise the following `{` would
                // incorrectly increment hash_brace_depth and suppress regex operators
                // inside the block body (issue #2844).
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectOperator;
                Some(Token {
                    token_type: TokenType::RightParen,
                    text: Arc::from(")"),
                    start,
                    end: self.position,
                })
            }
            ';' => {
                self.advance();
                // Semicolon ends prototype window (forward declaration)
                self.after_sub = false;
                // Semicolon is a statement boundary — any pending method-call chain is over.
                self.after_arrow = false;
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::Semicolon,
                    text: Arc::from(";"),
                    start,
                    end: self.position,
                })
            }
            ',' => {
                self.advance();
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::Comma,
                    text: Arc::from(","),
                    start,
                    end: self.position,
                })
            }
            '[' => {
                self.advance();
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::LeftBracket,
                    text: Arc::from("["),
                    start,
                    end: self.position,
                })
            }
            ']' => {
                self.advance();
                // A closing `]` from an array subscript leaves us in a state where
                // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
                // Set after_var_subscript so the `{` handler recognises it as such.
                // This mirrors the `}` handler's behavior when closing a hash subscript.
                self.after_var_subscript = true;
                self.mode = LexerMode::ExpectOperator;
                Some(Token {
                    token_type: TokenType::RightBracket,
                    text: Arc::from("]"),
                    start,
                    end: self.position,
                })
            }
            '{' => {
                self.advance();
                // Opening brace ends prototype window — no prototype follows
                self.after_sub = false;
                // `{` is a hash/slice subscript opener only when it immediately follows
                // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
                // This is narrower than the old `mode == ExpectOperator` check, which
                // incorrectly incremented depth for block-opening braces after `sub foo`,
                // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
                // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
                if self.after_var_subscript {
                    self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
                }
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::LeftBrace,
                    text: Arc::from("{"),
                    start,
                    end: self.position,
                })
            }
            '}' => {
                self.advance();
                self.after_arrow = false;
                // Decrement hash subscript brace depth only if we were inside one.
                // If depth > 0, this closes a hash subscript; enable chained subscripts
                // like $h{a}{b} by setting after_var_subscript so the next `{` is
                // recognized as another subscript opener.
                if self.hash_brace_depth > 0 {
                    self.hash_brace_depth -= 1;
                    // The subscript value is now the "variable" for a chained subscript.
                    self.after_var_subscript = true;
                } else {
                    // Block-close `}` — no subscript follows
                    self.after_var_subscript = false;
                }
                self.mode = LexerMode::ExpectOperator;
                Some(Token {
                    token_type: TokenType::RightBrace,
                    text: Arc::from("}"),
                    start,
                    end: self.position,
                })
            }
            '#' => {
                // Only treat as delimiter in ExpectDelimiter mode
                // (otherwise `#` starts a comment and is handled elsewhere).
                if matches!(self.mode, LexerMode::ExpectDelimiter) {
                    self.advance();
                    // Reset mode after consuming delimiter
                    self.mode = LexerMode::ExpectTerm;
                    Some(Token {
                        token_type: TokenType::Operator(Arc::from("#")),
                        text: Arc::from("#"),
                        start,
                        end: self.position,
                    })
                } else {
                    None
                }
            }
            _ => None,
        }
    }
2763
    /// Parse a double-quoted string starting at `start` (the opening `"`).
    ///
    /// When `config.parse_interpolation` is enabled, `$`-interpolation is
    /// decomposed into [`StringPart`]s: literal runs, `${...}` expressions,
    /// simple `$name` variables, `->`-chained element/method accesses
    /// (`$x->[0]`, `$x->{k}`, `$x->meth(...)`), and direct `$name[...]` /
    /// `$name{...}` lookups. A string with no interpolated parts becomes a
    /// plain `StringLiteral`; otherwise an `InterpolatedString`. Escapes are
    /// kept verbatim (`\` + next char) for later processing. On EOF without
    /// a closing quote, consumes to EOF and returns one error token.
    ///
    /// NOTE(review): only the `$` sigil is split into parts here; `@array`
    /// interpolation is not handled in this function — confirm it is covered
    /// elsewhere (or intentionally left literal).
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        // Guards the loop against a stuck position (defensive).
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                '"' => {
                    // Closing quote: flush any pending literal run and emit.
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Keep the escape verbatim (backslash + escaped char).
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    let part_start = self.position;
                    self.advance();
                    match self.current_char() {
                        Some('{') => {
                            // `${ expr }` block form — take the balanced braces.
                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                            parts.push(StringPart::Expression(Arc::from(
                                &self.input[part_start..self.position],
                            )));
                        }
                        Some(ch) if is_perl_identifier_start(ch) => {
                            let var_start = self.position;

                            // Fast path for ASCII identifier continuation
                            while self.position < self.input_bytes.len() {
                                let byte = self.input_bytes[self.position];
                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                    self.position += 1;
                                } else if byte >= 128 {
                                    // Only use UTF-8 parsing for non-ASCII
                                    if let Some(ch) = self.current_char() {
                                        if is_perl_identifier_continue(ch) {
                                            self.advance();
                                        } else {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                } else {
                                    break;
                                }
                            }

                            if self.position > var_start {
                                // The variable part includes the `$` sigil.
                                let var_name = &self.input[part_start..self.position];
                                parts.push(StringPart::Variable(Arc::from(var_name)));

                                if self.matches_bytes(b"->") {
                                    // Arrow chain: `$x->[...]`, `$x->{...}`,
                                    // `$x->(...)` or `$x->method(...)`.
                                    let tail_start = self.position;
                                    self.advance();
                                    self.advance();

                                    match self.current_char() {
                                        Some('[') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('[', ']', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('{') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('{', '}', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('(') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('(', ')', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some(ch) if is_perl_identifier_start(ch) => {
                                            // Method name (same fast-path identifier
                                            // scan as above), then optional arg list.
                                            while self.position < self.input_bytes.len() {
                                                let byte = self.input_bytes[self.position];
                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                                    self.position += 1;
                                                } else if byte >= 128 {
                                                    if let Some(ch) = self.current_char() {
                                                        if is_perl_identifier_continue(ch) {
                                                            self.advance();
                                                        } else {
                                                            break;
                                                        }
                                                    } else {
                                                        break;
                                                    }
                                                } else {
                                                    break;
                                                }
                                            }
                                            if self.current_char() == Some('(') {
                                                let _ = self.consume_balanced_segment_in_string(
                                                    '(', ')', '"',
                                                );
                                            }
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        _ => {
                                            // Bare `->` with nothing recognizable after
                                            // it — keep what we consumed as-is.
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                    }
                                } else if self.current_char() == Some('[') {
                                    // Direct element/slice: `$name[...]`.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
                                    parts.push(StringPart::ArraySlice(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                } else if self.current_char() == Some('{') {
                                    // Direct hash element: `$name{...}`.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                                    parts.push(StringPart::Expression(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                }
                            }
                        }
                        _ => {}
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        Some(self.unterminated_string_error(start))
    }
2946
2947 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2948 self.advance(); // Skip opening quote
2949
2950 let mut last_pos = self.position;
2951
2952 while let Some(ch) = self.current_char() {
2953 match ch {
2954 '\'' => {
2955 self.advance();
2956 let text = &self.input[start..self.position];
2957 self.mode = LexerMode::ExpectOperator;
2958
2959 return Some(Token {
2960 token_type: TokenType::StringLiteral,
2961 text: Arc::from(text),
2962 start,
2963 end: self.position,
2964 });
2965 }
2966 '\\' => {
2967 self.advance();
2968 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2969 self.advance();
2970 }
2971 }
2972 _ => self.advance(),
2973 }
2974
2975 // Safety check: ensure we're making progress
2976 if self.position == last_pos {
2977 break;
2978 }
2979 last_pos = self.position;
2980 }
2981
2982 Some(self.unterminated_string_error(start))
2983 }
2984
2985 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2986 self.advance(); // Skip opening backtick
2987
2988 let mut last_pos = self.position;
2989
2990 while let Some(ch) = self.current_char() {
2991 match ch {
2992 '`' => {
2993 self.advance();
2994 let text = &self.input[start..self.position];
2995 self.mode = LexerMode::ExpectOperator;
2996
2997 return Some(Token {
2998 token_type: TokenType::QuoteCommand,
2999 text: Arc::from(text),
3000 start,
3001 end: self.position,
3002 });
3003 }
3004 '\\' => {
3005 self.advance();
3006 if self.current_char().is_some() {
3007 self.advance();
3008 }
3009 }
3010 _ => self.advance(),
3011 }
3012
3013 // Safety check: ensure we're making progress
3014 if self.position == last_pos {
3015 break;
3016 }
3017 last_pos = self.position;
3018 }
3019
3020 Some(self.unterminated_string_error(start))
3021 }
3022
    /// Placeholder for dedicated q-string parsing.
    ///
    /// NOTE(review): currently a stub that always returns `None`, so callers
    /// fall through to the other tokenization paths. The `_start` offset is
    /// accepted for interface symmetry with the sibling `parse_*` methods.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
3027
3028 #[inline]
3029 fn unterminated_string_error(&mut self, start: usize) -> Token {
3030 // Consume to EOF so the caller receives a single terminal error token.
3031 let end = self.input.len();
3032 self.position = end;
3033
3034 Token {
3035 token_type: TokenType::Error(Arc::from("unterminated string")),
3036 text: Arc::from(&self.input[start..end]),
3037 start,
3038 end,
3039 }
3040 }
3041
3042 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
3043 // We've already consumed 's'
3044 let delimiter = self.current_char()?;
3045 self.advance(); // Skip delimiter
3046 self.parse_substitution_with_delimiter(start, delimiter)
3047 }
3048
    /// Parse the pattern/replacement/modifier tail of an `s` substitution,
    /// given that the opening delimiter has already been consumed.
    ///
    /// * Paired delimiters (e.g. `s{...}{...}`): the replacement has its own
    ///   delimiter, optionally preceded by whitespace.
    /// * Non-paired delimiters (e.g. `s/.../.../`): the replacement reuses the
    ///   same delimiter with no intervening whitespace.
    ///
    /// Trailing alphanumerics are consumed as modifiers without validation;
    /// the parser rejects invalid ones (MUT_005 fix). Always produces a
    /// `Substitution` token spanning `start..self.position` and switches the
    /// lexer into operator-expecting mode.
    fn parse_substitution_with_delimiter(
        &mut self,
        start: usize,
        delimiter: char,
    ) -> Option<Token> {
        // Pattern section. The `closed` flag from read_delimited_body is
        // deliberately ignored: an unterminated pattern still yields a token
        // covering whatever was scanned.
        self.read_delimited_body(delimiter);

        let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
        if pattern_is_paired {
            while self.current_char().is_some_and(char::is_whitespace) {
                self.advance();
            }

            if let Some(repl_delim) = self.current_char()
                && Self::is_quote_delim(repl_delim)
            {
                self.advance();
                self.read_delimited_body(repl_delim);
            }
        } else {
            // Same delimiter terminates both sections: s/pat/repl/.
            self.read_delimited_body(delimiter);
        }

        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
        while let Some(ch) = self.current_char() {
            if ch.is_ascii_alphanumeric() {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Substitution,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
3091
3092 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
3093 // We've already consumed 'tr' or 'y'
3094 while self.current_char().is_some_and(char::is_whitespace) {
3095 self.advance();
3096 }
3097
3098 let delimiter = self.current_char()?;
3099 self.advance(); // Skip delimiter
3100 self.parse_transliteration_with_delimiter(start, delimiter)
3101 }
3102
    /// Parse the search/replacement/modifier tail of a `tr`/`y` operator,
    /// given that the opening delimiter has already been consumed.
    ///
    /// Mirrors `parse_substitution_with_delimiter`: paired delimiters allow a
    /// separately-delimited replacement list (optionally after whitespace),
    /// while non-paired delimiters reuse the same delimiter. Trailing
    /// alphanumerics are consumed as modifiers without validation; the parser
    /// rejects invalid ones (MUT_005 fix). Always produces a
    /// `Transliteration` token and switches into operator-expecting mode.
    fn parse_transliteration_with_delimiter(
        &mut self,
        start: usize,
        delimiter: char,
    ) -> Option<Token> {
        // Search-list section; the `closed` flag is deliberately ignored so an
        // unterminated list still yields a token.
        self.read_delimited_body(delimiter);

        let search_is_paired = quote_handler::paired_close(delimiter).is_some();
        if search_is_paired {
            while self.current_char().is_some_and(char::is_whitespace) {
                self.advance();
            }

            if let Some(repl_delim) = self.current_char()
                && Self::is_quote_delim(repl_delim)
            {
                self.advance();
                self.read_delimited_body(repl_delim);
            }
        } else {
            // Same delimiter terminates both sections: tr/abc/xyz/.
            self.read_delimited_body(delimiter);
        }

        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
        while let Some(ch) = self.current_char() {
            if ch.is_ascii_alphanumeric() {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Transliteration,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
3145
    /// Read content between delimiters.
    ///
    /// Returns `(body, closed)` where `closed` is `true` if the closing
    /// delimiter was found before EOF, and `false` if EOF was reached first.
    ///
    /// Paired delimiters (those for which `quote_handler::paired_close`
    /// returns `Some`) nest: re-occurrences of the opener increase `depth` and
    /// are kept in the body, and the body ends only when the closer balances
    /// the initial opener. Non-paired delimiters end at the first unescaped
    /// occurrence. A backslash keeps itself and the following character
    /// verbatim. The outermost delimiters are never included in `body`.
    fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
        // `close` is the paired counterpart for bracket-style delimiters,
        // otherwise the delimiter itself.
        let paired = quote_handler::paired_close(delim);
        let close = paired.unwrap_or(delim);
        let mut body = String::new();
        // Depth starts at 1 for paired delimiters: the caller already consumed
        // the opening bracket.
        let mut depth = i32::from(paired.is_some());

        while let Some(ch) = self.current_char() {
            if ch == '\\' {
                // Escaped character: keep both bytes; neither can close.
                body.push(ch);
                self.advance();
                if let Some(next) = self.current_char() {
                    body.push(next);
                    self.advance();
                }
                continue;
            }

            if paired.is_some() && ch == delim {
                // Nested opener: part of the body, one more level to close.
                body.push(ch);
                self.advance();
                depth += 1;
                continue;
            }

            if ch == close {
                if paired.is_some() {
                    depth -= 1;
                    if depth == 0 {
                        // Balanced the original opener: body complete.
                        self.advance();
                        return (body, true);
                    }
                    // Closer of a nested pair stays in the body.
                    body.push(ch);
                    self.advance();
                } else {
                    self.advance();
                    return (body, true);
                }
                continue;
            }

            body.push(ch);
            self.advance();
        }

        // EOF reached without finding the closing delimiter
        (body, false)
    }
3197
3198 /// Parse a quote operator after we've seen the delimiter
3199 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3200 let info = self.current_quote_op.as_ref()?;
3201 let start = info.start_pos;
3202 let operator = info.operator.clone();
3203
3204 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3205 // below) does not leave a stale reference behind. The post-match cleanup at the
3206 // bottom of this function would otherwise be skipped for those operators.
3207 self.current_quote_op = None;
3208
3209 // Parse based on operator type; track whether all delimiters were closed.
3210 let closed = match operator.as_str() {
3211 "s" => {
3212 return self.parse_substitution_with_delimiter(start, delimiter);
3213 }
3214 "tr" | "y" => {
3215 return self.parse_transliteration_with_delimiter(start, delimiter);
3216 }
3217 "qr" => {
3218 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3219 self.parse_regex_modifiers("e_handler::QR_SPEC);
3220 body_closed
3221 }
3222 "m" => {
3223 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3224 self.parse_regex_modifiers("e_handler::M_SPEC);
3225 body_closed
3226 }
3227 _ => {
3228 // q, qq, qw, qx - no modifiers
3229 let (_body, body_closed) = self.read_delimited_body(delimiter);
3230 body_closed
3231 }
3232 };
3233
3234 let text = &self.input[start..self.position];
3235
3236 self.mode = LexerMode::ExpectOperator;
3237
3238 if !closed {
3239 // EOF reached before finding the closing delimiter — emit an error
3240 // token so the parser's recovery mechanism records a diagnostic.
3241 return Some(Token {
3242 token_type: TokenType::Error(Arc::from(format!(
3243 "unclosed {} delimiter '{}'",
3244 operator, delimiter
3245 ))),
3246 text: Arc::from(text),
3247 start,
3248 end: self.position,
3249 });
3250 }
3251
3252 let token_type = quote_handler::get_quote_token_type(&operator);
3253 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3254 }
3255
3256 /// Parse regex modifiers according to the given spec
3257 ///
3258 /// This function includes ALL characters that could be intended as modifiers,
3259 /// including invalid ones. This allows the parser to properly reject invalid
3260 /// modifiers with a clear error message, rather than leaving them as separate
3261 /// tokens that could be confusingly parsed.
3262 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3263 // Consume all alphanumeric characters that could be intended as modifiers
3264 // The parser will validate and reject invalid ones
3265 while let Some(ch) = self.current_char() {
3266 if ch.is_ascii_alphanumeric() {
3267 self.advance();
3268 } else {
3269 break;
3270 }
3271 }
3272 // Note: We no longer validate here - the parser will validate and provide
3273 // clear error messages for invalid modifiers (MUT_005 fix)
3274 }
3275
    /// Parse a regex literal starting with `/`
    ///
    /// **Budget Protection (Issue #422)**:
    /// - Budget guards prevent runaway scanning on pathological input
    /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
    /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
    /// - Graceful degradation: emit UnknownRest token if budget exceeded
    ///
    /// **Performance**:
    /// - Single-pass scanning with escape handling
    /// - Budget check per iteration (amortized O(1) via inline fast path)
    /// - Typical regex: <10μs, Large regex (64KB): ~1ms
    fn parse_regex(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening /

        let mut regex_parse_steps: usize = 0;
        // A `/` inside `[...]` does not terminate the regex, so we track
        // whether the scanner is currently inside a character class.
        let mut in_character_class = false;

        while let Some(ch) = self.current_char() {
            regex_parse_steps += 1;
            if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
                // Step budget exhausted: log (debug builds only) and consume
                // the rest of the input as a single UnknownRest token.
                #[cfg(debug_assertions)]
                {
                    let text = &self.input[start..self.position];
                    let preview = truncate_preview(text, 50);
                    tracing::debug!(
                        limit = MAX_REGEX_PARSE_STEPS,
                        pattern_preview = %preview,
                        "Regex parse step budget exceeded"
                    );
                }
                self.position = self.input.len();
                return Some(Token {
                    token_type: TokenType::UnknownRest,
                    text: empty_arc(),
                    start,
                    end: self.position,
                });
            }

            // Budget guard: prevent timeout on pathological input (Issue #422)
            // If exceeded, returns UnknownRest token for graceful degradation
            if let Some(token) = self.budget_guard(start, 0) {
                return Some(token);
            }

            match ch {
                '/' if !in_character_class => {
                    self.advance();
                    // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
                    while let Some(ch) = self.current_char() {
                        if ch.is_ascii_alphanumeric() {
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::RegexMatch,
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Handle escape sequences: consume backslash + next char
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                '[' => {
                    // Character classes are not depth-counted: a second `[`
                    // inside a class just re-sets the flag, which is harmless.
                    in_character_class = true;
                    self.advance();
                }
                ']' if in_character_class => {
                    in_character_class = false;
                    self.advance();
                }
                _ => self.advance(),
            }
        }

        // Unterminated regex - EOF reached before closing /
        // Parser will emit diagnostic for unterminated literal
        None
    }
3367}
3368
// Process-wide cached empty Arc<str>: callers get refcount bumps, never fresh
// allocations.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return the shared empty `Arc<str>`, initializing it on first use.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
3376
/// Produce a log-friendly preview of `text`: at most `max_chars` characters,
/// with `...` appended when anything was cut. Truncation is done at a char
/// boundary, so multi-byte UTF-8 input is never split.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    let cut_at = text.char_indices().nth(max_chars).map(|(idx, _)| idx);
    if let Some(idx) = cut_at {
        format!("{}...", &text[..idx])
    } else {
        // Fewer than max_chars characters: return the whole string unchanged.
        text.to_string()
    }
}
3383
3384#[inline(always)]
3385fn is_keyword_fast(word: &str) -> bool {
3386 // Fast length-based rejection for most cases.
3387 // Lexer keywords are currently bounded to 1..=9 characters.
3388 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
3389}
3390
3391#[inline]
3392fn is_builtin_function(word: &str) -> bool {
3393 BARE_TERM_BUILTINS.binary_search(&word).is_ok()
3394}
3395
/// True when `word` (as bytes) is exactly one of the quote-like operator
/// keywords: `m`, `q`, `qq`, `qw`, `qx`, `qr`.
#[inline(always)]
fn is_quote_op_word_prefix(word: &[u8]) -> bool {
    match word {
        b"m" | b"q" => true,
        [b'q', second] => matches!(second, b'q' | b'w' | b'x' | b'r'),
        _ => false,
    }
}
3400
/// Builtins treated as able to be directly followed by a bare term (e.g. a
/// `/pattern/` regex rather than division — see the builtin disambiguation
/// tests).
///
/// INVARIANT: must remain sorted in ASCII order — `is_builtin_function`
/// binary-searches this slice.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex", "int",
    "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print", "push",
    "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt", "substr",
    "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
3407
/// Fast lookup table for compound operator second characters.
///
/// INVARIANT: every byte that can appear as the second character of a
/// two-character compound operator must be listed here, because
/// `is_compound_operator` uses this table as an early-reject filter before
/// its exhaustive match. Fix: `b'/'` was missing, which made the
/// `(b'/', b'/')` arm (defined-or `//`) unreachable on the ASCII fast path
/// and inconsistent with the non-ASCII fallback.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*:/";
3410
/// Whether the two characters form a two-character compound operator
/// (`+=`, `==`, `=~`, `++`, `&&`, `<<`, `->`, `::`, …).
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    // Optimized compound operator lookup using perfect hashing for common cases
    // Convert to bytes for faster comparison (most operators are ASCII)
    if first.is_ascii() && second.is_ascii() {
        let first_byte = first as u8;
        let second_byte = second as u8;

        // Early reject: COMPOUND_SECOND_CHARS must stay in sync with the match
        // arms below — any second byte missing from the table (e.g. b'/' for
        // the defined-or `//` arm) makes its arm unreachable and diverges from
        // the non-ASCII fallback. NOTE(review): verify the table lists every
        // second byte matched below.
        if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
            return false;
        }

        // Use lookup table approach for maximum performance
        match (first_byte, second_byte) {
            // Assignment operators
            (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,

            // Comparison operators
            (b'<' | b'>' | b'=' | b'!', b'=') => true,

            // Pattern operators
            (b'=' | b'!', b'~') => true,

            // Increment/decrement
            (b'+', b'+') | (b'-', b'-') => true,

            // Logical operators
            (b'&', b'&') | (b'|', b'|') => true,

            // Shift operators
            (b'<', b'<') | (b'>', b'>') => true,

            // Other compound operators
            (b'*', b'*')
            | (b'/', b'/')
            | (b'-' | b'=', b'>')
            | (b'.', b'.')
            | (b'~', b'~')
            | (b':', b':') => true,

            _ => false,
        }
    } else {
        // Fallback for non-ASCII (should be rare)
        matches!(
            (first, second),
            ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
                | ('=' | '!' | '~', '~')
                | ('+', '+')
                | ('-', '-' | '>')
                | ('&', '&')
                | ('|', '|')
                | ('<', '<')
                | ('>' | '=', '>')
                | ('*', '*')
                | ('/', '/')
                | ('.', '.')
                | (':', ':')
        )
    }
}
3472
// Checkpoint support for incremental parsing
impl Checkpointable for PerlLexer<'_> {
    /// Snapshot the complete lexer state so it can later be restored by
    /// `restore`. The checkpoint also records a coarse `CheckpointContext`
    /// describing which construct (format body, quote-like, or normal code)
    /// the lexer was inside.
    fn checkpoint(&self) -> LexerCheckpoint {
        use checkpoint::CheckpointContext;

        // Determine the checkpoint context based on current state
        let context = if matches!(self.mode, LexerMode::InFormatBody) {
            CheckpointContext::Format {
                // NOTE(review): the format start is only approximated by
                // backing up 100 bytes — confirm consumers tolerate this.
                start_position: self.position.saturating_sub(100), // Approximate
            }
        } else if !self.delimiter_stack.is_empty() {
            // We're in some kind of quote-like construct
            CheckpointContext::QuoteLike {
                operator: String::new(), // Would need to track this
                delimiter: self.delimiter_stack.last().copied().unwrap_or('\0'),
                is_paired: true,
            }
        } else {
            CheckpointContext::Normal
        };

        LexerCheckpoint {
            position: self.position,
            mode: self.mode,
            delimiter_stack: self.delimiter_stack.clone(),
            in_prototype: self.in_prototype,
            prototype_depth: self.prototype_depth,
            after_sub: self.after_sub,
            after_arrow: self.after_arrow,
            hash_brace_depth: self.hash_brace_depth,
            after_var_subscript: self.after_var_subscript,
            paren_depth: self.paren_depth,
            current_pos: self.current_pos,
            context,
        }
    }

    /// Restore every state field captured by `checkpoint`. Field-for-field
    /// mirror of the snapshot above; `clone_from` reuses the existing
    /// delimiter-stack allocation.
    fn restore(&mut self, checkpoint: &LexerCheckpoint) {
        self.position = checkpoint.position;
        self.mode = checkpoint.mode;
        self.delimiter_stack.clone_from(&checkpoint.delimiter_stack);
        self.in_prototype = checkpoint.in_prototype;
        self.prototype_depth = checkpoint.prototype_depth;
        self.after_sub = checkpoint.after_sub;
        self.after_arrow = checkpoint.after_arrow;
        self.hash_brace_depth = checkpoint.hash_brace_depth;
        self.after_var_subscript = checkpoint.after_var_subscript;
        self.paren_depth = checkpoint.paren_depth;
        self.current_pos = checkpoint.current_pos;

        // Handle special contexts
        use checkpoint::CheckpointContext;
        if let CheckpointContext::Format { .. } = &checkpoint.context {
            // Ensure we're in format body mode
            if !matches!(self.mode, LexerMode::InFormatBody) {
                self.mode = LexerMode::InFormatBody;
            }
        }
    }

    /// A checkpoint is restorable as long as its position lies within the
    /// current input (checkpoints from a different/longer input are rejected).
    fn can_restore(&self, checkpoint: &LexerCheckpoint) -> bool {
        // Can restore if the position is valid for our input
        checkpoint.position <= self.input.len()
    }
}
3538
3539#[cfg(test)]
3540mod test_format_debug;
3541
#[cfg(test)]
mod tests {
    use super::*;

    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    #[test]
    fn test_basic_tokens() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 42;");

        let token = lexer.next_token().ok_or("Expected keyword token")?;
        assert_eq!(token.token_type, TokenType::Keyword(Arc::from("my")));

        let token = lexer.next_token().ok_or("Expected identifier token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(_)));

        let token = lexer.next_token().ok_or("Expected operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(_)));

        let token = lexer.next_token().ok_or("Expected number token")?;
        assert!(matches!(token.token_type, TokenType::Number(_)));

        let token = lexer.next_token().ok_or("Expected semicolon token")?;
        assert_eq!(token.token_type, TokenType::Semicolon);
        Ok(())
    }

    #[test]
    fn test_slash_disambiguation() -> TestResult {
        // Division
        let mut lexer = PerlLexer::new("10 / 2");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);

        // Regex
        let mut lexer = PerlLexer::new("if (/pattern/)");
        lexer.next_token(); // if
        lexer.next_token(); // (
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }

    #[test]
    fn test_percent_and_double_sigil_disambiguation() -> TestResult {
        // Hash variable
        let mut lexer = PerlLexer::new("%hash");
        let token = lexer.next_token().ok_or("Expected hash identifier token")?;
        assert!(
            matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "%hash")
        );

        // Modulo operator
        let mut lexer = PerlLexer::new("10 % 3");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected modulo operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "%"));
        Ok(())
    }

    #[test]
    fn test_defined_or_and_exponent() -> TestResult {
        // Defined-or operator
        let mut lexer = PerlLexer::new("$a // $b");
        lexer.next_token(); // $a
        let token = lexer.next_token().ok_or("Expected defined-or operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "//"));

        // Regex after =~ should still parse
        let mut lexer = PerlLexer::new("$x =~ //");
        lexer.next_token(); // $x
        lexer.next_token(); // =~
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);

        // Exponent operator
        let mut lexer = PerlLexer::new("2 ** 3");
        lexer.next_token(); // 2
        let token = lexer.next_token().ok_or("Expected exponent operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "**"));
        Ok(())
    }

    #[test]
    fn test_join_regex_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("join /,/, @parts");
        let token = lexer.next_token().ok_or("Expected join token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "join"));

        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }

    #[test]
    fn test_builtin_regex_disambiguation() -> TestResult {
        for code in ["print /pattern/", "defined /pattern/", "keys /pattern/"] {
            let mut lexer = PerlLexer::new(code);
            lexer.next_token();
            let token = lexer.next_token().ok_or("Expected regex token")?;
            assert_eq!(token.token_type, TokenType::RegexMatch, "{code}");
        }
        Ok(())
    }

    #[test]
    fn test_nullary_builtin_division_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("time / 2");
        let token = lexer.next_token().ok_or("Expected time token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "time"));

        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);
        Ok(())
    }

    #[test]
    fn test_peek_token_does_not_mutate_paren_depth() -> TestResult {
        // Regression guard for issue #2750: peek_token() must save and restore
        // paren_depth so that a peek at `(` does not permanently increment
        // paren_depth and corrupt the heredoc/bitshift guard on a subsequent token.
        let mut lexer = PerlLexer::new("(1<<2)");
        assert_eq!(lexer.paren_depth, 0, "paren_depth must start at 0");

        // Peek at `(` — must not permanently increment paren_depth
        let peeked = lexer.peek_token().ok_or("peek at ( failed")?;
        assert_eq!(peeked.token_type, TokenType::LeftParen);
        assert_eq!(lexer.paren_depth, 0, "peek_token must not mutate paren_depth");

        // Consume `(` — paren_depth becomes 1
        lexer.next_token();
        assert_eq!(lexer.paren_depth, 1);

        // Peek at `1` (a number) — paren_depth must remain 1
        let peeked2 = lexer.peek_token().ok_or("peek at 1 failed")?;
        assert!(matches!(peeked2.token_type, TokenType::Number(_)));
        assert_eq!(lexer.paren_depth, 1, "peek at number must not change paren_depth");

        Ok(())
    }

    #[test]
    fn test_comment_skipping_with_cr_line_endings() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 1;# comment\rmy $y = 2;");
        let mut saw_second_my = false;

        while let Some(token) = lexer.next_token() {
            if matches!(token.token_type, TokenType::EOF) {
                break;
            }

            if matches!(token.token_type, TokenType::Keyword(ref kw) if kw.as_ref() == "my")
                && token.start > 0
            {
                saw_second_my = true;
            }
        }

        assert!(saw_second_my, "lexer should continue after CR-terminated comment line");
        Ok(())
    }

    #[test]
    fn test_pod_skipped_with_cr_only_line_endings() -> TestResult {
        // CR-only line endings (classic Mac): =pod and =cut must be detected
        // when preceded by \r instead of \n.
        let input = "my $before = 1;\r=pod\rThis is documentation.\r=cut\rmy $after = 2;";
        let mut lexer = PerlLexer::new(input);
        let mut token_texts: Vec<String> = Vec::new();

        while let Some(token) = lexer.next_token() {
            if matches!(token.token_type, TokenType::EOF) {
                break;
            }
            if matches!(token.token_type, TokenType::Keyword(_) | TokenType::Identifier(_)) {
                token_texts.push(token.text.to_string());
            }
        }

        // A second `my` must appear after the POD block, i.e. at least two `my`
        // keywords in total. (Fix: the previous `any(..)` formulation evaluated
        // the same global "is there a second my" condition for every element,
        // which reduced to this count check in a confusing way.)
        assert!(
            token_texts.iter().filter(|t| t.as_str() == "my").count() >= 2,
            "lexer should produce tokens after CR-terminated =cut; got: {:?}",
            token_texts
        );

        // Ensure POD body text is not present as an identifier token
        assert!(
            !token_texts.iter().any(|t| t == "documentation"),
            "POD body should be consumed, not emitted as a token; got: {:?}",
            token_texts
        );
        Ok(())
    }

    #[test]
    fn test_exponent_sign_no_digits_plus() -> TestResult {
        // .5e+x — 'e' is not a valid exponent (no digits follow), so the number
        // token must be ".5" only. The 'e' becomes a separate identifier token.
        // Regression: old code produced Number(".5e") by backtracking to the sign
        // character instead of to the 'e' itself.
        let mut lexer = PerlLexer::new(".5e+x");
        let tok1 = lexer.next_token().ok_or("expected first token")?;
        assert!(
            matches!(&tok1.token_type, TokenType::Number(n) if n.as_ref() == ".5"),
            "expected Number(\".5\") but got {:?}",
            tok1.token_type
        );
        // The 'e' must NOT be swallowed into the number token.
        let tok2 = lexer.next_token().ok_or("expected second token")?;
        assert!(
            !matches!(&tok2.token_type, TokenType::Number(_)),
            "number token must not include 'e'; second token should not be a Number, got {:?}",
            tok2.token_type
        );
        Ok(())
    }

    #[test]
    fn test_exponent_sign_no_digits_minus() -> TestResult {
        // 1.5e-y — 'e' is not a valid exponent (no digits follow), so the number
        // token must be "1.5" only. The 'e' becomes a separate identifier token.
        // Regression: old code produced Number("1.5e") by backtracking to the '-'
        // character instead of to the 'e' itself.
        let mut lexer = PerlLexer::new("1.5e-y");
        let tok1 = lexer.next_token().ok_or("expected first token")?;
        assert!(
            matches!(&tok1.token_type, TokenType::Number(n) if n.as_ref() == "1.5"),
            "expected Number(\"1.5\") but got {:?}",
            tok1.token_type
        );
        // The 'e' must NOT be swallowed into the number token.
        let tok2 = lexer.next_token().ok_or("expected second token")?;
        assert!(
            !matches!(&tok2.token_type, TokenType::Number(_)),
            "number token must not include 'e'; second token should not be a Number, got {:?}",
            tok2.token_type
        );
        Ok(())
    }
}
3785}