perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![warn(clippy::all)]
117#![allow(
118 // Core allows for lexer code
119 clippy::too_many_lines,
120 clippy::module_name_repetitions,
121 clippy::cast_possible_truncation,
122 clippy::cast_sign_loss,
123 clippy::cast_possible_wrap,
124 clippy::cast_precision_loss,
125 clippy::must_use_candidate,
126 clippy::missing_errors_doc,
127 clippy::missing_panics_doc,
128
129 // Lexer-specific patterns that are fine
130 clippy::match_same_arms,
131 clippy::redundant_else,
132 clippy::unnecessary_wraps,
133 clippy::unused_self,
134 clippy::items_after_statements,
135 clippy::struct_excessive_bools,
136 clippy::uninlined_format_args
137)]
138
139use std::sync::{Arc, OnceLock};
140
141pub mod api;
142pub mod builtins;
143pub mod checkpoint;
144pub mod config;
145pub mod error;
146mod heredoc;
147pub mod keywords;
148pub mod limits;
149pub mod mode;
150mod quote_handler;
151pub mod token;
152pub mod tokenizer;
153mod unicode;
154
155pub use api::*;
156pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
157pub use config::LexerConfig;
158pub use error::{LexerError, Result};
159pub use limits::MAX_REGEX_PARSE_STEPS;
160pub use mode::LexerMode;
161pub use perl_position_tracking::Position;
162pub use token::{StringPart, Token, TokenType};
163
164use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
165
166use crate::heredoc::HeredocSpec;
167use crate::limits::{
168 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
169};
170
/// Context-aware Perl lexer that produces a token stream from source text.
///
/// The lexer tracks an internal [`LexerMode`] to disambiguate context-sensitive
/// syntax (e.g., `/` as division vs. regex delimiter). Construct with
/// [`PerlLexer::new`] and call [`PerlLexer::next_token`] or
/// [`PerlLexer::collect_tokens`] to consume the stream.
///
/// # Examples
///
/// ```rust
/// use perl_lexer::{PerlLexer, TokenType};
///
/// let mut lexer = PerlLexer::new("my $x = 42;");
/// let tokens = lexer.collect_tokens();
/// assert!(!tokens.is_empty());
/// ```
pub struct PerlLexer<'a> {
    /// Source text being tokenized (borrowed for the lexer's lifetime)
    input: &'a str,
    /// Cached input bytes for faster access
    input_bytes: &'a [u8],
    /// Current byte offset into `input`
    position: usize,
    /// Current term/operator expectation used to disambiguate `/`, `%`, etc.
    mode: LexerMode,
    /// Tunables: interpolation parsing, position tracking, max lookahead
    config: LexerConfig,
    /// Stack for nested delimiters in s{}{} constructs
    delimiter_stack: Vec<char>,
    /// Track if we're inside prototype parens after 'sub'
    in_prototype: bool,
    /// Paren depth to track when we exit prototype
    prototype_depth: usize,
    /// Track if we just saw a 'sub' keyword (waiting for possible prototype)
    after_sub: bool,
    /// Track if we just saw a '->' operator (to suppress s/tr/y as substitution)
    after_arrow: bool,
    /// Depth of hash-subscript brace nesting.
    /// When > 0, suppresses quote-op detection so `m`, `s`, `q*`, `tr`, `y`
    /// are treated as bareword identifiers (hash keys) rather than regex operators.
    /// Depth tracking means all positions inside `$h{...}` — including after commas
    /// in hash slices like `@h{m, s}` — correctly suppress quote-op misidentification.
    hash_brace_depth: usize,
    /// Set to `true` immediately after emitting a complete `$var`, `@var`, or `%var`
    /// token (not bare sigils used for dereference). Cleared by any operator,
    /// punctuation, or keyword token. The `{` handler increments `hash_brace_depth`
    /// only when this flag is set, ensuring only genuine hash/slice subscripts
    /// (e.g. `$h{m}`, `@h{s, tr}`) suppress quote-op detection — not block-opening
    /// braces after `sub foo`, `if (cond)`, `else`, `while (cond)`, etc.
    after_var_subscript: bool,
    /// Depth of open parentheses — used to distinguish `(1<<func())` (bitshift)
    /// from `print $fh <<END` (heredoc at statement level, paren_depth == 0).
    paren_depth: usize,
    /// Current position with line/column tracking
    #[allow(dead_code)]
    current_pos: Position,
    /// Track if we just skipped a newline (for __DATA__/__END__ detection)
    after_newline: bool,
    /// Queue of pending heredocs waiting for their bodies
    pending_heredocs: Vec<HeredocSpec>,
    /// Track the byte offset of the current line's start
    line_start_offset: usize,
    /// If true, emit `HeredocBody` tokens; otherwise just consume them.
    emit_heredoc_body_tokens: bool,
    /// Current quote operator being parsed
    current_quote_op: Option<quote_handler::QuoteOperatorInfo>,
    /// Track if EOF has been emitted to prevent infinite loops
    eof_emitted: bool,
    /// Start time for timeout protection
    start_time: std::time::Instant,
}
238
239impl<'a> PerlLexer<'a> {
240 /// Create a new lexer for the given input
241 pub fn new(input: &'a str) -> Self {
242 Self::with_config(input, LexerConfig::default())
243 }
244
245 /// Create a new lexer with custom configuration
246 pub fn with_config(input: &'a str, config: LexerConfig) -> Self {
247 Self {
248 input,
249 input_bytes: input.as_bytes(),
250 position: 0,
251 mode: LexerMode::ExpectTerm,
252 config,
253 delimiter_stack: Vec::new(),
254 in_prototype: false,
255 prototype_depth: 0,
256 after_sub: false,
257 after_arrow: false,
258 hash_brace_depth: 0,
259 after_var_subscript: false,
260 paren_depth: 0,
261 current_pos: Position::start(),
262 after_newline: true, // Start of file counts as after newline
263 pending_heredocs: Vec::new(),
264 line_start_offset: 0,
265 emit_heredoc_body_tokens: false,
266 current_quote_op: None,
267 eof_emitted: false,
268 start_time: std::time::Instant::now(),
269 }
270 }
271
272 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
273 pub fn with_body_tokens(input: &'a str) -> Self {
274 let mut lexer = Self::new(input);
275 lexer.emit_heredoc_body_tokens = true;
276 lexer
277 }
278
279 /// Normalize file start by skipping BOM if present
280 fn normalize_file_start(&mut self) {
281 // Skip UTF-8 BOM (EF BB BF) if at file start
282 if self.position == 0 && self.matches_bytes(&[0xEF, 0xBB, 0xBF]) {
283 self.position = 3;
284 self.line_start_offset = 3;
285 }
286 }
287
    /// Set the lexer mode (for resetting state at statement boundaries)
    ///
    /// Exposed so a driving parser can override the current term/operator
    /// expectation directly.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
292
293 /// Helper to check if remaining bytes on a line are only spaces/tabs
294 #[inline]
295 fn trailing_ws_only(bytes: &[u8], mut p: usize) -> bool {
296 while p < bytes.len() && bytes[p] != b'\n' && bytes[p] != b'\r' {
297 match bytes[p] {
298 b' ' | b'\t' => p += 1,
299 _ => return false,
300 }
301 }
302 true
303 }
304
305 /// Consume a newline sequence (CRLF or LF) and update state
306 #[inline]
307 fn consume_newline(&mut self) {
308 if self.position >= self.input.len() {
309 return;
310 }
311 match self.input_bytes[self.position] {
312 b'\r' => {
313 self.position += 1;
314 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n' {
315 self.position += 1;
316 }
317 }
318 b'\n' => self.advance(),
319 _ => return, // not at a newline
320 }
321 self.after_newline = true;
322 self.line_start_offset = self.position;
323 }
324
325 /// Find the end of the current line, returning both raw end and visible end (without trailing CR)
326 #[inline]
327 fn find_line_end(bytes: &[u8], start: usize) -> (usize, usize) {
328 let mut end = start;
329 while end < bytes.len() && bytes[end] != b'\n' && bytes[end] != b'\r' {
330 end += 1;
331 }
332 let visible_end = end;
333 (end, visible_end)
334 }
335
    /// Parse a quoted heredoc delimiter such as the `"END"` in `<<"END"`.
    ///
    /// Called with the cursor on the opening `quote` character. Accumulates
    /// the raw text (including both quotes) into `text` and returns the bare
    /// delimiter label on success. Returns `None` — so normal tokenization
    /// can continue — if a CR/LF appears before the closing quote or the
    /// input ends first.
    #[inline]
    fn parse_quoted_heredoc_delimiter(&mut self, quote: char, text: &mut String) -> Option<String> {
        text.push(quote);
        self.advance();

        let mut delim = String::new();
        while self.position < self.input.len() {
            let Some(ch) = self.current_char() else {
                break;
            };

            // Closing quote: delimiter complete.
            if ch == quote {
                text.push(ch);
                self.advance();
                return Some(delim);
            }

            // Delimiter quoting cannot span a line. If we hit CR/LF before the
            // closing quote, this is not a valid heredoc opener.
            if ch == '\n' || ch == '\r' {
                return None;
            }

            delim.push(ch);
            text.push(ch);
            self.advance();
        }

        // Unterminated quoted delimiter: degrade gracefully by treating this as
        // not-a-heredoc so normal tokenization can continue.
        None
    }
368
    /// Advance the lexer and return the next token.
    ///
    /// Returns `None` only after an `EOF` token has already been emitted.
    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
    ///
    /// The body is a single loop so that heredoc-body consumption and
    /// quote-operator recovery can `continue` instead of recursing.
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            // (heredoc bodies are whitespace-significant).
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow.
                // A body_start of 0 doubles as the "not in a body" sentinel.
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // Allow any leading spaces/tabs before the label
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any)
                            // — stacked heredocs' bodies follow one another.
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            // A `None` from the skipper ends the token stream early.
            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first.
            // Order matters: contextual forms (heredocs, strings, variables)
            // must win over the generic operator/delimiter fallbacks below.
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_vstring() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Unicode path without intermediate heap allocation
                let mut buf = [0_u8; 4];
                Arc::from(ch.encode_utf8(&mut buf))
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
633
634 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
635 ///
636 /// **Purpose**: Protect against pathological input that could cause:
637 /// - Infinite loops in regex/heredoc parsing
638 /// - Excessive memory consumption
639 /// - LSP server hangs
640 ///
641 /// **Limits**:
642 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
643 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
644 ///
645 /// **Graceful Degradation**:
646 /// - Budget exceeded → emit `UnknownRest` token
647 /// - Jump to EOF to prevent further parsing of problematic region
648 /// - LSP client can emit soft diagnostic about truncation
649 /// - All previously parsed symbols remain valid
650 ///
651 /// **Performance**:
652 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
653 /// - Slow path: Only triggered on pathological input
654 /// - Amortized cost: O(1) per token
655 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
656 #[inline(always)]
657 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
658 // Fast path: most calls won't hit limits
659 let bytes_consumed = self.position - start;
660 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
661 return None;
662 }
663
664 // Slow path: budget exceeded - graceful degradation
665 #[cfg(debug_assertions)]
666 {
667 tracing::debug!(
668 bytes_consumed,
669 depth,
670 position = self.position,
671 "Lexer budget exceeded"
672 );
673 }
674
675 self.position = self.input.len();
676 Some(Token {
677 token_type: TokenType::UnknownRest,
678 text: Arc::from(""),
679 start,
680 end: self.position,
681 })
682 }
683
    /// Peek at the next token without consuming it.
    ///
    /// Saves and restores the full lexer state so the next call to
    /// [`next_token`](Self::next_token) returns the same token.
    ///
    /// Note: this clones the delimiter stack, the pending-heredoc queue, and
    /// the in-flight quote-operator info, so peeking is not free when that
    /// state is non-empty. Any new field added to the lexer that
    /// `next_token` mutates must also be saved/restored here.
    pub fn peek_token(&mut self) -> Option<Token> {
        // Snapshot every piece of state next_token() may mutate.
        let saved_pos = self.position;
        let saved_mode = self.mode;
        let saved_delimiter_stack = self.delimiter_stack.clone();
        let saved_prototype = self.in_prototype;
        let saved_depth = self.prototype_depth;
        let saved_after_sub = self.after_sub;
        let saved_after_arrow = self.after_arrow;
        let saved_hash_brace_depth = self.hash_brace_depth;
        let saved_after_var_subscript = self.after_var_subscript;
        let saved_paren_depth = self.paren_depth;
        let saved_current_pos = self.current_pos;
        let saved_after_newline = self.after_newline;
        let saved_pending_heredocs = self.pending_heredocs.clone();
        let saved_line_start_offset = self.line_start_offset;
        let saved_current_quote_op = self.current_quote_op.clone();
        let saved_eof_emitted = self.eof_emitted;
        let saved_start_time = self.start_time;

        let token = self.next_token();

        // Roll everything back so the peeked token is produced again.
        self.position = saved_pos;
        self.mode = saved_mode;
        self.delimiter_stack = saved_delimiter_stack;
        self.in_prototype = saved_prototype;
        self.prototype_depth = saved_depth;
        self.after_sub = saved_after_sub;
        self.after_arrow = saved_after_arrow;
        self.hash_brace_depth = saved_hash_brace_depth;
        self.after_var_subscript = saved_after_var_subscript;
        self.paren_depth = saved_paren_depth;
        self.current_pos = saved_current_pos;
        self.after_newline = saved_after_newline;
        self.pending_heredocs = saved_pending_heredocs;
        self.line_start_offset = saved_line_start_offset;
        self.current_quote_op = saved_current_quote_op;
        self.eof_emitted = saved_eof_emitted;
        self.start_time = saved_start_time;

        token
    }
729
730 /// Consume all remaining tokens and return them as a vector.
731 ///
732 /// The returned vector always ends with an `EOF` token.
733 pub fn collect_tokens(&mut self) -> Vec<Token> {
734 let mut tokens = Vec::new();
735 while let Some(token) = self.next_token() {
736 if token.token_type == TokenType::EOF {
737 tokens.push(token);
738 break;
739 }
740 tokens.push(token);
741 }
742 tokens
743 }
744
745 /// Reset the lexer to the beginning of the input.
746 ///
747 /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
748 /// so the lexer can re-tokenize the same source from scratch.
749 pub fn reset(&mut self) {
750 self.position = 0;
751 self.mode = LexerMode::ExpectTerm;
752 self.delimiter_stack.clear();
753 self.in_prototype = false;
754 self.prototype_depth = 0;
755 self.after_sub = false;
756 self.after_arrow = false;
757 self.hash_brace_depth = 0;
758 self.after_var_subscript = false;
759 self.paren_depth = 0;
760 self.current_pos = Position::start();
761 self.after_newline = true;
762 self.pending_heredocs.clear();
763 self.line_start_offset = 0;
764 self.current_quote_op = None;
765 self.eof_emitted = false;
766 self.start_time = std::time::Instant::now();
767 }
768
    /// Switch the lexer into format-body parsing mode.
    ///
    /// In this mode the lexer consumes input verbatim until it encounters a
    /// line containing only `.` (the Perl format terminator). The next call
    /// to `next_token` dispatches to the format-body parser.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
776
777 // Internal helper methods
778
779 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
780 #[inline(always)]
781 fn byte_at(bytes: &[u8], index: usize) -> u8 {
782 debug_assert!(index < bytes.len());
783 match bytes.get(index) {
784 Some(&byte) => byte,
785 None => 0,
786 }
787 }
788
789 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
790 #[inline(always)]
791 fn current_char(&self) -> Option<char> {
792 if self.position < self.input_bytes.len() {
793 // For ASCII, direct access is safe
794 let byte = Self::byte_at(self.input_bytes, self.position);
795 if byte < 128 {
796 Some(byte as char)
797 } else {
798 // For non-ASCII, fall back to proper UTF-8 parsing
799 self.input.get(self.position..).and_then(|s| s.chars().next())
800 }
801 } else {
802 None
803 }
804 }
805
    /// Peek at the character `offset` positions ahead without advancing.
    ///
    /// `offset` is treated as a *byte* offset for the bounds/ASCII check, but
    /// the non-ASCII fallback uses `chars().nth(offset)`, i.e. a *char*
    /// offset from the current position.
    /// NOTE(review): those two interpretations only coincide when every
    /// character between the cursor and the target is ASCII — confirm that
    /// callers never peek across multi-byte characters.
    #[inline(always)]
    fn peek_char(&self, offset: usize) -> Option<char> {
        // Lookahead is capped by configuration to bound disambiguation work.
        if offset > self.config.max_lookahead {
            return None;
        }

        let pos = self.position.checked_add(offset)?;
        if pos < self.input_bytes.len() {
            // For ASCII, direct access is safe
            let byte = Self::byte_at(self.input_bytes, pos);
            if byte < 128 {
                Some(byte as char)
            } else {
                // For non-ASCII, use chars iterator
                self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
            }
        } else {
            None
        }
    }
826
827 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
828 #[inline(always)]
829 fn advance(&mut self) {
830 if self.position < self.input_bytes.len() {
831 let byte = Self::byte_at(self.input_bytes, self.position);
832 if byte < 128 {
833 // ASCII fast path
834 self.position += 1;
835 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
836 {
837 self.position += ch.len_utf8();
838 }
839 }
840 }
841
    /// General-purpose balanced-segment consumer (no quote-boundary recovery).
    ///
    /// Called with the cursor on `open`; consumes through the matching
    /// `close`, honoring nesting and backslash escapes. Returns
    /// `Some(position)` just past the closing delimiter on success, or
    /// `None` if the input ends before the segment balances (the cursor is
    /// left wherever scanning stopped).
    ///
    /// For use inside double-quoted string interpolation where the outer `"` must
    /// act as a recovery boundary, use [`consume_balanced_segment_in_string`] instead.
    #[allow(dead_code)]
    #[inline]
    fn consume_balanced_segment(&mut self, open: char, close: char) -> Option<usize> {
        // Must be sitting on the opening delimiter.
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Consume the backslash and the escaped character as a pair
                    // so an escaped delimiter doesn't affect nesting depth.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        None
    }
880
    /// Balanced-segment consumer for interpolation tails inside quoted strings.
    ///
    /// Like [`consume_balanced_segment`], but additionally treats
    /// `terminator` (the outer string's closing quote) as a hard stop:
    /// hitting it returns `None` *without consuming it*, so the outer string
    /// parser can still terminate its token cleanly. Returns
    /// `Some(position)` just past the matching `close` on success.
    #[inline]
    fn consume_balanced_segment_in_string(
        &mut self,
        open: char,
        close: char,
        terminator: char,
    ) -> Option<usize> {
        // Must be sitting on the opening delimiter.
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Consume the backslash and the escaped character as a pair.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == terminator => {
                    // Local recovery for interpolation tails in quoted strings:
                    // stop at the closing quote so the outer string parser can
                    // still terminate this token cleanly.
                    return None;
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        None
    }
925
926 /// Fast byte-level check for ASCII characters
927 #[inline]
928 fn peek_byte(&self, offset: usize) -> Option<u8> {
929 if offset > self.config.max_lookahead {
930 return None;
931 }
932
933 let pos = self.position.checked_add(offset)?;
934 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
935 }
936
937 /// Check if the next bytes match a pattern (ASCII only)
938 #[inline]
939 fn matches_bytes(&self, pattern: &[u8]) -> bool {
940 let Some(end_offset) = pattern.len().checked_sub(1) else {
941 return true;
942 };
943
944 if end_offset > self.config.max_lookahead {
945 return false;
946 }
947
948 let Some(end) = self.position.checked_add(pattern.len()) else {
949 return false;
950 };
951
952 if end <= self.input_bytes.len() {
953 &self.input_bytes[self.position..end] == pattern
954 } else {
955 false
956 }
957 }
958
/// Advance `self.position` past whitespace, `#` line comments, and POD
/// sections, returning `Some(())` when done (the only return value; the
/// `Option` shape keeps the signature uniform with other lexer steps).
///
/// Side effects beyond moving the cursor:
/// - clears `after_newline` unless the cursor is still at the start of a line;
/// - on each newline, stamps `body_start` on the first pending heredoc that
///   does not have one yet (FIFO order);
/// - leaves `#` untouched in `ExpectDelimiter` mode, where it is a quote-like
///   delimiter rather than a comment.
#[inline]
fn skip_whitespace_and_comments(&mut self) -> Option<()> {
    // Don't reset after_newline if we're at the start of a line
    if self.position > 0 && self.position != self.line_start_offset {
        self.after_newline = false;
    }

    while self.position < self.input_bytes.len() {
        let byte = Self::byte_at(self.input_bytes, self.position);
        match byte {
            // Fast path for ASCII whitespace - batch process
            b' ' => {
                // Batch skip spaces for better cache efficiency
                let start = self.position;
                while self.position < self.input_bytes.len()
                    && Self::byte_at(self.input_bytes, self.position) == b' '
                {
                    self.position += 1;
                }
                // Continue outer loop if we processed any spaces
                if self.position > start {
                    // Loop naturally continues to next iteration
                }
            }
            b'\t' | 0x0B | 0x0C => {
                // Batch skip horizontal tab, vertical tab, and form feed.
                // Perl treats these as whitespace separators.
                let start = self.position;
                while self.position < self.input_bytes.len()
                    && matches!(
                        Self::byte_at(self.input_bytes, self.position),
                        b'\t' | 0x0B | 0x0C
                    )
                {
                    self.position += 1;
                }
                if self.position > start {
                    // Loop naturally continues to next iteration
                }
            }
            b'\r' | b'\n' => {
                self.consume_newline();

                // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                // Only check if we have pending heredocs to avoid unnecessary work
                if !self.pending_heredocs.is_empty() {
                    for spec in &mut self.pending_heredocs {
                        if spec.body_start == 0 {
                            spec.body_start = self.position;
                            break; // Only set for the first unresolved heredoc
                        }
                    }
                }
            }
            b'#' => {
                // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                if matches!(self.mode, LexerMode::ExpectDelimiter) {
                    break;
                }

                // Skip line comment using memchr for fast newline search
                self.position += 1; // Skip # directly

                // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
                if let Some(newline_offset) =
                    memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
                {
                    self.position += newline_offset;
                } else {
                    // No newline found, skip to end
                    self.position = self.input_bytes.len();
                }
            }
            // '=' only opens a POD section when it is the first byte of a line.
            b'=' if self.position == 0
                || (self.position > 0
                    && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
            {
                // Check if this starts a POD section (=pod, =head, =over, etc.)
                // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                let remaining = &self.input_bytes[self.position..];
                if remaining.starts_with(b"=pod")
                    || remaining.starts_with(b"=head")
                    || remaining.starts_with(b"=over")
                    || remaining.starts_with(b"=item")
                    || remaining.starts_with(b"=back")
                    || remaining.starts_with(b"=begin")
                    || remaining.starts_with(b"=end")
                    || remaining.starts_with(b"=for")
                    || remaining.starts_with(b"=encoding")
                {
                    // Scan forward for \n=cut (end of POD block)
                    let search_start = self.position;
                    let mut found_cut = false;
                    let bytes = self.input_bytes;
                    let mut i = search_start;
                    while i < bytes.len() {
                        // Look for =cut at the start of a line
                        if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
                            && bytes[i..].starts_with(b"=cut")
                        {
                            i += 4; // Skip "=cut"
                            // Skip rest of the =cut line
                            while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
                                i += 1;
                            }
                            // Consume one line ending sequence if present
                            // (CRLF counts as a single line ending)
                            if i < bytes.len() && bytes[i] == b'\r' {
                                i += 1;
                                if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                            } else if i < bytes.len() && bytes[i] == b'\n' {
                                i += 1;
                            }
                            self.position = i;
                            found_cut = true;
                            break;
                        }
                        i += 1;
                    }
                    if !found_cut {
                        // POD extends to end of file
                        self.position = bytes.len();
                    }
                    continue;
                }
                // Not a POD directive - regular '=' token
                break;
            }
            _ => {
                // For non-ASCII whitespace, use char check only when needed
                if byte >= 128
                    && let Some(ch) = self.current_char()
                    && ch.is_whitespace()
                {
                    self.advance();
                    continue;
                }
                break;
            }
        }
    }
    Some(())
}
1103
/// Try to lex a heredoc opener (`<<LABEL`, `<<~LABEL`, `<<"LABEL"`, `<<\LABEL`).
///
/// On success, returns a `HeredocStart` token and queues a `HeredocSpec` whose
/// `body_start` is filled in later (when the following newline is seen by
/// `skip_whitespace_and_comments`). On failure, restores `self.position` and
/// returns `None` so `<<` can be re-lexed as the left-shift operator.
fn try_heredoc(&mut self) -> Option<Token> {
    // `<<` is the left-shift operator, not a heredoc, when we are inside
    // a parenthesized expression and have just finished a term.
    // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
    // so `<<index` must be the bitshift operator, not a heredoc start.
    //
    // We must NOT fire the guard at statement level (paren_depth == 0) because
    // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
    // is a heredoc. The depth check distinguishes the two cases.
    if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
        return None;
    }

    // Check for heredoc start
    if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
        return None;
    }

    let start = self.position;
    // `text` accumulates the exact opener spelling (including `~`, spaces,
    // backslash, and the delimiter) so the token reproduces the source.
    let mut text = String::from("<<");
    self.position += 2; // Skip <<

    // Check for indented heredoc (~)
    let allow_indent = if self.current_char() == Some('~') {
        text.push('~');
        self.advance();
        true
    } else {
        false
    };

    // Skip whitespace between `<<` and the delimiter (kept in `text`)
    while let Some(ch) = self.current_char() {
        if ch == ' ' || ch == '\t' {
            text.push(ch);
            self.advance();
        } else {
            break;
        }
    }

    // Optional backslash disables interpolation, treat like single-quoted label
    let backslashed = if self.current_char() == Some('\\') {
        text.push('\\');
        self.advance();
        true
    } else {
        false
    };

    // Parse delimiter: quoted ("..."/'...'/`...`) unless backslashed,
    // otherwise a bare identifier-like word.
    let delimiter = if self.position < self.input.len() {
        match self.current_char() {
            Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
            Some('\'') if !backslashed => {
                self.parse_quoted_heredoc_delimiter('\'', &mut text)?
            }
            Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
            Some(c) if is_perl_identifier_start(c) => {
                // Bare word delimiter
                let mut delim = String::new();
                while self.position < self.input.len() {
                    if let Some(c) = self.current_char() {
                        if is_perl_identifier_continue(c) {
                            delim.push(c);
                            text.push(c);
                            self.advance();
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                delim
            }
            _ => {
                // Not a valid heredoc delimiter - reset position and return None
                // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
                self.position = start;
                return None;
            }
        }
    } else {
        // No delimiter found - reset position and return None
        self.position = start;
        return None;
    };

    // For now, return a placeholder token
    // The actual heredoc body would be parsed later when we encounter it
    self.mode = LexerMode::ExpectOperator;

    // Recursion depth limit (Issue #443): refuse to queue unbounded heredocs
    if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
        return Some(Token {
            token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
            text: Arc::from(text),
            start,
            end: self.position,
        });
    }

    // Queue the heredoc spec with its label
    self.pending_heredocs.push(HeredocSpec {
        label: Arc::from(delimiter.as_str()),
        body_start: 0, // Will be set when we see the newline after this line
        allow_indent,
    });

    Some(Token {
        token_type: TokenType::HeredocStart,
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
1221
1222 fn try_string(&mut self) -> Option<Token> {
1223 let start = self.position;
1224 let quote = self.current_char()?;
1225
1226 match quote {
1227 '"' => self.parse_double_quoted_string(start),
1228 '\'' => self.parse_single_quoted_string(start),
1229 '`' => self.parse_backtick_string(start),
1230 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1231 _ => None,
1232 }
1233 }
1234
1235 #[inline]
1236 fn try_number(&mut self) -> Option<Token> {
1237 let start = self.position;
1238
1239 // Fast byte check for digits - optimized bounds checking
1240 let bytes = self.input_bytes;
1241 if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
1242 return None;
1243 }
1244
1245 // Check for hex (0x), binary (0b), or octal (0o) prefixes
1246 let mut pos = self.position;
1247 if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
1248 let prefix_byte = bytes[pos + 1];
1249 if prefix_byte == b'x' || prefix_byte == b'X' {
1250 // Hexadecimal: 0x[0-9a-fA-F_]+
1251 pos += 2; // consume '0x'
1252 let digit_start = pos;
1253 let mut saw_digit = false;
1254 while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
1255 saw_digit |= bytes[pos].is_ascii_hexdigit();
1256 pos += 1;
1257 }
1258 if pos > digit_start && saw_digit {
1259 self.position = pos;
1260 let text = &self.input[start..self.position];
1261 self.mode = LexerMode::ExpectOperator;
1262 return Some(Token {
1263 token_type: TokenType::Number(Arc::from(text)),
1264 text: Arc::from(text),
1265 start,
1266 end: self.position,
1267 });
1268 }
1269 // No hex digits after 0x - fall through to parse '0' as decimal
1270 } else if prefix_byte == b'b' || prefix_byte == b'B' {
1271 // Binary: 0b[01_]+
1272 pos += 2; // consume '0b'
1273 let digit_start = pos;
1274 let mut saw_digit = false;
1275 while pos < bytes.len()
1276 && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
1277 {
1278 saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
1279 pos += 1;
1280 }
1281 if pos > digit_start && saw_digit {
1282 self.position = pos;
1283 let text = &self.input[start..self.position];
1284 self.mode = LexerMode::ExpectOperator;
1285 return Some(Token {
1286 token_type: TokenType::Number(Arc::from(text)),
1287 text: Arc::from(text),
1288 start,
1289 end: self.position,
1290 });
1291 }
1292 // No binary digits after 0b - fall through to parse '0' as decimal
1293 } else if prefix_byte == b'o' || prefix_byte == b'O' {
1294 // Octal (explicit): 0o[0-7_]+
1295 pos += 2; // consume '0o'
1296 let digit_start = pos;
1297 let mut saw_digit = false;
1298 while pos < bytes.len()
1299 && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
1300 {
1301 saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
1302 pos += 1;
1303 }
1304 if pos > digit_start && saw_digit {
1305 self.position = pos;
1306 let text = &self.input[start..self.position];
1307 self.mode = LexerMode::ExpectOperator;
1308 return Some(Token {
1309 token_type: TokenType::Number(Arc::from(text)),
1310 text: Arc::from(text),
1311 start,
1312 end: self.position,
1313 });
1314 }
1315 // No octal digits after 0o - fall through to parse '0' as decimal
1316 }
1317 }
1318
1319 // Consume initial digits - unrolled for better performance
1320 pos = self.position;
1321 while pos < bytes.len() {
1322 let byte = Self::byte_at(bytes, pos);
1323 if byte.is_ascii_digit() || byte == b'_' {
1324 pos += 1;
1325 } else {
1326 break;
1327 }
1328 }
1329 self.position = pos;
1330
1331 // Check for decimal point - optimized with single bounds check
1332 if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
1333 // Peek ahead to see what follows the dot
1334 let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();
1335
1336 // Optimized dot consumption logic
1337 let should_consume_dot = has_following_digit || {
1338 pos + 1 >= bytes.len() || {
1339 // Use bitwise operations for faster character classification
1340 let next_byte = bytes[pos + 1];
1341 // Whitespace, delimiters, operators - optimized check
1342 next_byte <= b' '
1343 || matches!(
1344 next_byte,
1345 b';' | b','
1346 | b')'
1347 | b'}'
1348 | b']'
1349 | b'+'
1350 | b'-'
1351 | b'*'
1352 | b'/'
1353 | b'%'
1354 | b'='
1355 | b'<'
1356 | b'>'
1357 | b'!'
1358 | b'&'
1359 | b'|'
1360 | b'^'
1361 | b'~'
1362 | b'e'
1363 | b'E'
1364 )
1365 }
1366 };
1367
1368 if should_consume_dot {
1369 pos += 1; // consume the dot
1370 // Consume fractional digits - batch processing
1371 while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
1372 pos += 1;
1373 }
1374 self.position = pos;
1375 }
1376 }
1377
1378 // Check for exponent - optimized
1379 if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
1380 let exp_start = pos;
1381 pos += 1; // consume 'e' or 'E'
1382
1383 // Check for optional sign
1384 if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
1385 pos += 1;
1386 }
1387
1388 // Must have at least one digit after exponent (underscores allowed between digits)
1389 let mut saw_digit = false;
1390 while pos < bytes.len() {
1391 let byte = bytes[pos];
1392 if byte.is_ascii_digit() {
1393 saw_digit = true;
1394 pos += 1;
1395 } else if byte == b'_' {
1396 pos += 1;
1397 } else {
1398 break;
1399 }
1400 }
1401
1402 // If no digits after exponent, backtrack
1403 if !saw_digit {
1404 pos = exp_start;
1405 }
1406
1407 self.position = pos;
1408 }
1409
1410 // Avoid string slicing for common number cases - use Arc::from directly on slice
1411 let text = &self.input[start..self.position];
1412 self.mode = LexerMode::ExpectOperator;
1413
1414 Some(Token {
1415 token_type: TokenType::Number(Arc::from(text)),
1416 text: Arc::from(text),
1417 start,
1418 end: self.position,
1419 })
1420 }
1421
1422 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1423 // We're at the dot, consume it
1424 self.advance();
1425
1426 // Parse the fractional part
1427 while self.position < self.input_bytes.len() {
1428 let byte = self.input_bytes[self.position];
1429 match byte {
1430 b'0'..=b'9' | b'_' => self.position += 1,
1431 b'e' | b'E' => {
1432 // Handle scientific notation.
1433 // Save the position of 'e'/'E' so we can backtrack here if
1434 // no digits follow the exponent marker (with or without sign).
1435 let e_pos = self.position;
1436 self.advance();
1437 if self.position < self.input_bytes.len() {
1438 let next = self.input_bytes[self.position];
1439 if next == b'+' || next == b'-' {
1440 self.advance();
1441 }
1442 }
1443 // Parse exponent digits (underscores allowed between digits)
1444 let exponent_start = self.position;
1445 let mut saw_digit = false;
1446 while self.position < self.input_bytes.len() {
1447 let byte = self.input_bytes[self.position];
1448 if byte.is_ascii_digit() {
1449 saw_digit = true;
1450 self.position += 1;
1451 } else if byte == b'_' {
1452 self.position += 1;
1453 } else {
1454 break;
1455 }
1456 }
1457
1458 // No digits after exponent marker — backtrack to just before
1459 // 'e'/'E' so the caller sees it as a separate token.
1460 // Using e_pos (not exponent_start-1) avoids including 'e' in
1461 // the number slice when a sign character was consumed.
1462 if !saw_digit {
1463 let _ = exponent_start; // mark as intentionally unused
1464 self.position = e_pos;
1465 }
1466 break;
1467 }
1468 _ => break,
1469 }
1470 }
1471
1472 let text = &self.input[start..self.position];
1473 self.mode = LexerMode::ExpectOperator;
1474
1475 Some(Token {
1476 token_type: TokenType::Number(Arc::from(text)),
1477 text: Arc::from(text),
1478 start,
1479 end: self.position,
1480 })
1481 }
1482
/// Lex a variable token beginning with a sigil (`$`, `@`, `%`, or `*`).
///
/// Handles plain names (`$foo`, `@Foo::bar`), the array-length form
/// (`$#array`), braced forms (`${foo}`, `${^MATCH}`, `$::{foo}`, `*{$glob}`),
/// punctuation variables (`$?`, `$$`, `$^W`, `@+`, `%-`), and emits a
/// bare-sigil token when the sigil opens a dereference (`@{$ref}`) or a
/// postfix deref after `->` so the parser can handle the braces itself.
///
/// Side effects: switches the mode to `ExpectOperator` and sets
/// `after_var_subscript` where a following `{` must count as a subscript.
fn try_variable(&mut self) -> Option<Token> {
    let start = self.position;
    let sigil = self.current_char()?;

    match sigil {
        '$' | '@' | '%' | '*' => {
            // In ExpectOperator mode, treat % and * as operators rather than sigils
            if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                return None;
            }
            self.advance();

            // Special case: After ->, sigils followed by { or [ should be tokenized separately
            // This is for postfix dereference like ->@*, ->%{}, ->@[]
            // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
            let check_arrow = self.position >= 3
                && self.position.saturating_sub(1) <= self.input.len()
                && self.input.is_char_boundary(self.position.saturating_sub(3))
                && self.input.is_char_boundary(self.position.saturating_sub(1));

            if check_arrow
                && {
                    // Temporarily rewind to look for the `->` just before the sigil,
                    // then restore the cursor regardless of the outcome.
                    let saved = self.position;
                    self.position -= 3;
                    let arrow = self.matches_bytes(b"->");
                    self.position = saved;
                    arrow
                }
                && matches!(self.current_char(), Some('{' | '[' | '*'))
            {
                // Just return the sigil
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;

                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }

            // Check for $# (array length operator)
            if sigil == '$' && self.current_char() == Some('#') {
                self.advance(); // consume #
                // Now parse the array name
                while let Some(ch) = self.current_char() {
                    if is_perl_identifier_continue(ch) {
                        self.advance();
                    } else if ch == ':' && self.peek_char(1) == Some(':') {
                        // Package-qualified array name
                        self.advance();
                        self.advance();
                    } else {
                        break;
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // $#foo is a complete variable token; a following `{` is a subscript.
                self.after_var_subscript = true;

                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }

            // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
            if self.current_char() == Some('{') {
                // Peek ahead to decide if we should consume the brace
                let next_char = self.peek_char(1);

                // Check if this is a dereference like @{$ref} or @{[...]}
                // If the next char suggests dereference, don't consume the brace.
                // For @ and % sigils, identifiers inside braces are also derefs
                // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                let is_deref = sigil != '*'
                    && (matches!(
                        next_char,
                        Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                    ) || (matches!(sigil, '@' | '%')
                        && next_char.is_some_and(is_perl_identifier_start)));
                if is_deref {
                    // This is a dereference, don't consume the brace
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // A standalone sigil token before `{` starts a dereference
                    // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
                    // Mark it as subscript-capable so `{` increments brace depth
                    // and the closing `}` can enable chained `{...}` subscripts.
                    // (Broader form than master's `$|@|%` filter — `*` is already
                    // excluded by the `is_deref` guard above and `&` deref also
                    // benefits from chained-subscript handling.)
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                self.advance(); // consume {

                // Handle special variables with caret, e.g. ${^MATCH}
                if self.current_char() == Some('^') {
                    self.advance(); // consume ^
                    // Parse the special variable name
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance(); // consume }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                // Handle stash access like $::{foo}
                else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                    self.advance(); // consume first :
                    self.advance(); // consume second :
                    // Skip optional { and }
                    if self.current_char() == Some('{') {
                        self.advance();
                    }
                    // Parse the name
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance();
                            if self.current_char() == Some('}') {
                                self.advance(); // consume closing } of ${...}
                            }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                // Regular braced variable like ${foo} or glob like *{$glob}
                else {
                    // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                    // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                    // EXCEPT for globs - *{$glob} should be parsed as one token
                    // Also check for empty braces or EOF - in these cases we should split the tokens
                    if sigil != '*'
                        && (matches!(
                            self.current_char(),
                            Some(
                                '$' | '@'
                                    | '%'
                                    | '*'
                                    | '&'
                                    | '['
                                    | ' '
                                    | '\t'
                                    | '\n'
                                    | '\r'
                                    | '}'
                            )
                        ) || self.current_char().is_none())
                    {
                        // This is a dereference or empty/invalid brace, backtrack
                        self.position = start + 1; // Just past the sigil
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // Same as above: sigil-only token means a dereference opener.
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    // For glob access, we need to consume everything inside braces
                    // (nested braces tracked with a depth counter)
                    if sigil == '*' {
                        let mut brace_depth: usize = 1;
                        while let Some(ch) = self.current_char() {
                            if ch == '{' {
                                brace_depth += 1;
                            } else if ch == '}' {
                                brace_depth = brace_depth.saturating_sub(1);
                                if brace_depth == 0 {
                                    self.advance(); // consume final }
                                    break;
                                }
                            }
                            self.advance();
                        }
                    } else {
                        // Regular variable: consume the identifier up to the closing }
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
            }
            // Parse regular variable name
            else if let Some(ch) = self.current_char() {
                if is_perl_identifier_start(ch) {
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                    // Handle package-qualified segments like Foo::bar
                    while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance();
                        self.advance();
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
                // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                // Not inside prototypes where ^ is a literal prototype char
                else if sigil == '$' && ch == '^' && !self.in_prototype {
                    self.advance(); // consume ^
                    // $^Letter: consume the single uppercase letter
                    if let Some(letter) = self.current_char()
                        && letter.is_ascii_uppercase()
                    {
                        self.advance();
                    }
                    // bare $^ (no uppercase letter follows): format_top_name — stop here
                }
                // Handle special punctuation variables
                // Not inside prototypes where ; and , are literal prototype chars
                else if sigil == '$'
                    && !self.in_prototype
                    && matches!(
                        ch,
                        '?' | '!'
                            | '@'
                            | '&'
                            | '`'
                            | '\''
                            | '.'
                            | '/'
                            | '\\'
                            | '|'
                            | '+'
                            | '-'
                            | '['
                            | ']'
                            | '$'
                            | '~'
                            | '='
                            | '%'
                            | ','
                            | '"'
                            | ';'
                            | '>'
                            | '<'
                            | ')'
                            | '(' // $( = real group ID of this process
                    )
                {
                    self.advance(); // consume the special character
                }
                // $$ is the PID special variable, but only when it is not immediately
                // followed by an identifier-start character. $$var is scalar dereference
                // of $var, so keep the second $ for the next token.
                else if sigil == '$' && ch == '$' {
                    if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                        self.advance(); // consume the second $ for bare $$ PID
                    }
                }
                // Handle special array/hash punctuation variables (@+, @-, %+, %-)
                else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                    self.advance(); // consume the + or -
                }
            }

            let text = &self.input[start..self.position];
            self.mode = LexerMode::ExpectOperator;
            // A complete $foo, @foo, %foo token can be followed by a hash/slice
            // subscript `{`. Set the flag so the `{` handler knows to increment
            // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
            // hash subscripts in the same way.
            self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

            Some(Token {
                token_type: TokenType::Identifier(Arc::from(text)),
                text: Arc::from(text),
                start,
                end: self.position,
            })
        }
        _ => None,
    }
}
1799
1800 /// Return the next non-space char and the char immediately following it (without consuming).
1801 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1802 /// from `=` used as a plain delimiter.
1803 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1804 let mut i = self.position;
1805 while i < self.input.len() {
1806 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1807 Some(c) => c,
1808 None => return (None, None),
1809 };
1810 if c.is_whitespace() {
1811 i += c.len_utf8();
1812 continue;
1813 }
1814 // Found non-space at position i; peek the next char after it
1815 let j = i + c.len_utf8();
1816 let following = self.input.get(j..).and_then(|s| s.chars().next());
1817 return (Some(c), following);
1818 }
1819 (None, None)
1820 }
1821
/// Is `c` usable as a quote-like delimiter? (non-alnum, including paired)
///
/// Perl accepts any character that is neither ASCII-alphanumeric nor
/// whitespace — including control characters
/// (e.g. s\x07pattern\x07replacement\x07).
fn is_quote_delim(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '0'..='9' => false,
        ws if ws.is_whitespace() => false,
        _ => true,
    }
}
1828
1829 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1830 ///
1831 /// A v-string starts with `v` followed by one or more digits, then optionally
1832 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1833 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1834 #[inline]
1835 fn try_vstring(&mut self) -> Option<Token> {
1836 let start = self.position;
1837 let bytes = self.input_bytes;
1838
1839 // Must start with 'v' followed by at least one digit
1840 if start >= bytes.len() || bytes[start] != b'v' {
1841 return None;
1842 }
1843
1844 let next_pos = start + 1;
1845 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1846 return None;
1847 }
1848
1849 // We have `v` followed by a digit — scan the rest of the v-string.
1850 // Pattern: v DIGITS (.DIGITS)*
1851 let mut pos = next_pos;
1852
1853 // Consume leading digits
1854 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1855 pos += 1;
1856 }
1857
1858 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1859 while pos < bytes.len() && bytes[pos] == b'.' {
1860 let dot_pos = pos;
1861 pos += 1; // skip '.'
1862
1863 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1864 // Dot not followed by digit — not part of the v-string
1865 pos = dot_pos;
1866 break;
1867 }
1868
1869 // Consume digits after the dot
1870 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1871 pos += 1;
1872 }
1873 }
1874
1875 // Make sure the v-string isn't followed by identifier-continuation characters
1876 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1877 if pos < bytes.len() {
1878 let next_byte = bytes[pos];
1879 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1880 return None;
1881 }
1882 // Also check for non-ASCII identifier continuations
1883 if next_byte >= 128
1884 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1885 && is_perl_identifier_continue(ch)
1886 {
1887 return None;
1888 }
1889 }
1890
1891 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1892 let text = &self.input[start..pos];
1893
1894 self.position = pos;
1895 self.mode = LexerMode::ExpectOperator;
1896
1897 Some(Token {
1898 token_type: TokenType::Version(Arc::from(text)),
1899 text: Arc::from(text),
1900 start,
1901 end: self.position,
1902 })
1903 }
1904
1905 #[inline]
1906 fn try_identifier_or_keyword(&mut self) -> Option<Token> {
1907 let start = self.position;
1908 let ch = self.current_char()?;
1909 let bytes = self.input_bytes;
1910 let len = bytes.len();
1911
1912 if is_perl_identifier_start(ch) {
1913 // Special case: substitution/transliteration with single-quote delimiter
1914 // The single quote is considered an identifier continuation, so we need to
1915 // detect these operators before consuming it as part of an identifier.
1916 if !self.after_arrow
1917 && self.hash_brace_depth == 0
1918 && ch == 's'
1919 && self.peek_char(1) == Some('\'')
1920 {
1921 self.advance(); // consume 's'
1922 return self.parse_substitution(start);
1923 } else if !self.after_arrow
1924 && self.hash_brace_depth == 0
1925 && ch == 'y'
1926 && self.peek_char(1) == Some('\'')
1927 {
1928 self.advance(); // consume 'y'
1929 return self.parse_transliteration(start);
1930 } else if !self.after_arrow
1931 && self.hash_brace_depth == 0
1932 && ch == 't'
1933 && self.peek_char(1) == Some('r')
1934 && self.peek_char(2) == Some('\'')
1935 {
1936 self.advance(); // consume 't'
1937 self.advance(); // consume 'r'
1938 return self.parse_transliteration(start);
1939 }
1940
1941 // Fast ASCII path for identifier continuation.
1942 while self.position < len {
1943 let byte = bytes[self.position];
1944 if byte == b'\'' && is_quote_op_word_prefix(&bytes[start..self.position]) {
1945 // Keep apostrophe for quote-operator parsing in cases like q'...'.
1946 break;
1947 }
1948
1949 if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
1950 self.position += 1;
1951 continue;
1952 }
1953
1954 if byte < 128 {
1955 break;
1956 }
1957
1958 if let Some(ch) = self.current_char()
1959 && is_perl_identifier_continue(ch)
1960 {
1961 self.advance();
1962 continue;
1963 }
1964 break;
1965 }
1966 // Handle package-qualified identifiers like Foo::bar.
1967 while self.config.max_lookahead >= 1
1968 && self.position + 1 < len
1969 && bytes[self.position] == b':'
1970 && bytes[self.position + 1] == b':'
1971 {
1972 self.position += 2; // consume '::'
1973
1974 // consume following identifier segment if present
1975 let Some(ch) = self.current_char() else {
1976 break;
1977 };
1978 if !is_perl_identifier_start(ch) {
1979 break;
1980 }
1981 self.advance();
1982 while self.position < len {
1983 let byte = bytes[self.position];
1984 if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
1985 self.position += 1;
1986 continue;
1987 }
1988 if byte < 128 {
1989 break;
1990 }
1991 if let Some(ch) = self.current_char()
1992 && is_perl_identifier_continue(ch)
1993 {
1994 self.advance();
1995 continue;
1996 }
1997 break;
1998 }
1999 }
2000
2001 let text = &self.input[start..self.position];
2002
2003 // Check for __DATA__ and __END__ markers using exact match
2004 // Only recognize these in code channel, not inside data/format sections or heredocs
2005 let in_code_channel =
2006 !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
2007 && self.pending_heredocs.is_empty();
2008
2009 let marker = if in_code_channel {
2010 if text == "__DATA__" {
2011 Some("__DATA__")
2012 } else if text == "__END__" {
2013 Some("__END__")
2014 } else {
2015 None
2016 }
2017 } else {
2018 None
2019 };
2020
2021 if let Some(marker_text) = marker {
2022 // These must be at the beginning of a line
2023 // Use the after_newline flag to determine if we're at line start
2024 if self.after_newline {
2025 // Check if rest of line is only whitespace
2026 // Only treat as data marker if line has no trailing junk
2027 if Self::trailing_ws_only(self.input_bytes, self.position) {
2028 // Consume the rest of the line (the marker line)
2029 while self.position < self.input.len()
2030 && self.input_bytes[self.position] != b'\n'
2031 && self.input_bytes[self.position] != b'\r'
2032 {
2033 self.advance();
2034 }
2035 self.consume_newline();
2036
2037 // Switch to data section mode
2038 self.mode = LexerMode::InDataSection;
2039
2040 return Some(Token {
2041 token_type: TokenType::DataMarker(Arc::from(marker_text)),
2042 text: Arc::from(marker_text),
2043 start,
2044 end: self.position,
2045 });
2046 }
2047 }
2048 }
2049
2050 // Check for substitution/transliteration operators
2051 // Skip if after '->' -- these are method names, not operators.
2052 #[allow(clippy::collapsible_if)]
2053 if !self.after_arrow && self.hash_brace_depth == 0 && matches!(text, "s" | "tr" | "y") {
2054 let immediate = self.current_char();
2055 let (candidate, char_after_next, has_whitespace) =
2056 if immediate.is_some_and(|c| c.is_whitespace()) {
2057 let (nc, ca) = self.peek_nonspace_and_following();
2058 (nc, ca, true)
2059 } else {
2060 let following = immediate.and_then(|c| {
2061 let j = self.position + c.len_utf8();
2062 self.input.get(j..).and_then(|s| s.chars().next())
2063 });
2064 (immediate, following, false)
2065 };
2066
2067 if let Some(next) = candidate {
2068 // `s => 1` should remain a fat-arrow hash key, not quote op.
2069 let is_fat_arrow = next == '=' && char_after_next == Some('>');
2070 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
2071 let is_quote_char = matches!(next, '\'' | '"') && text != "s";
2072 let transliteration_allows_whitespace = text == "tr" || text == "y";
2073 let substitution_disallows_whitespace = text == "s" && has_whitespace;
2074 let is_valid_delim = Self::is_quote_delim(next)
2075 && !is_fat_arrow
2076 && !substitution_disallows_whitespace
2077 && (!has_whitespace
2078 || is_paired_delim
2079 || is_quote_char
2080 || transliteration_allows_whitespace);
2081
2082 if is_valid_delim {
2083 match text {
2084 "s" => return self.parse_substitution(start),
2085 "tr" | "y" => return self.parse_transliteration(start),
2086 unexpected => {
2087 return Some(Token {
2088 token_type: TokenType::Error(Arc::from(format!(
2089 "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
2090 unexpected, start
2091 ))),
2092 text: Arc::from(unexpected),
2093 start,
2094 end: self.position,
2095 });
2096 }
2097 }
2098 }
2099 }
2100 }
2101
2102 let token_type = if is_keyword_fast(text) {
2103 // Check for special keywords that affect lexer mode
2104 match text {
2105 "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
2106 | "sort" | "split" | "and" | "or" | "xor" | "not"
2107 // These keywords introduce an expression, so a following `/` is a
2108 // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
2109 // `do /file/`, and `eval /re/` are all valid Perl.
2110 | "return" | "die" | "warn" | "do" | "eval" => {
2111 self.mode = LexerMode::ExpectTerm;
2112 }
2113 "sub" => {
2114 self.after_sub = true;
2115 self.mode = LexerMode::ExpectTerm;
2116 }
2117 // Quote operators expect a delimiter next.
2118 // Skip if after '->' -- these are method names, not operators.
2119 // Skip inside hash subscript braces (hash_brace_depth > 0) — all
2120 // positions inside `$h{...}` or `@h{...}` treat quote-op names as
2121 // bareword keys, including after commas in slices like `@h{m, s}`.
2122 op if !self.after_arrow
2123 && self.hash_brace_depth == 0
2124 && quote_handler::is_quote_operator(op) =>
2125 {
2126 // Perl allows whitespace between a quote-like operator and its delimiter,
2127 // but ONLY for paired delimiters (s { ... } { ... }g).
2128 // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
2129 // must be immediately adjacent — otherwise `s $foo` would wrongly
2130 // treat `$` as a delimiter instead of being a bareword `s` followed
2131 // by a scalar variable.
2132 //
2133 // Strategy:
2134 // 1. Check the immediately-adjacent char first (no whitespace skip).
2135 // If it is a valid delimiter → any non-alnum, non-whitespace char.
2136 // 2. If the adjacent char is whitespace, peek past it.
2137 // Only accept PAIRED delimiters ({, [, (, <) in that case.
2138 let immediate = self.current_char();
2139 let (candidate, char_after_next, has_whitespace) =
2140 if immediate.is_some_and(|c| c.is_whitespace()) {
2141 // There is whitespace — peek past it
2142 let (nc, ca) = self.peek_nonspace_and_following();
2143 (nc, ca, true)
2144 } else {
2145 // No whitespace — use immediate char
2146 let following = immediate.and_then(|c| {
2147 let j = self.position + c.len_utf8();
2148 self.input.get(j..).and_then(|s| s.chars().next())
2149 });
2150 (immediate, following, false)
2151 };
2152
2153 if let Some(next) = candidate {
2154 // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
2155 // not a valid substitution delimiter. Treat as identifier.
2156 let is_fat_arrow = next == '=' && char_after_next == Some('>');
2157
2158 // When whitespace precedes the delimiter, only unambiguous
2159 // delimiters are accepted:
2160 // - Paired delimiters ({, [, (, <) are always safe.
2161 // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
2162 // is a valid file-size filetest and must not be treated as a
2163 // substitution start. All other operators (qw, q, qq, qr, qx, m,
2164 // tr, y) have no corresponding file-test operator.
2165 // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
2166 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
2167 let is_quote_char = matches!(next, '\'' | '"') && op != "s";
2168 let is_valid_delim = Self::is_quote_delim(next)
2169 && !is_fat_arrow
2170 && (!has_whitespace || is_paired_delim || is_quote_char);
2171
2172 if is_valid_delim {
2173 self.mode = LexerMode::ExpectDelimiter;
2174 self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
2175 operator: op.to_string(),
2176 delimiter: '\0', // Will be set when we see the delimiter
2177 start_pos: start,
2178 });
2179
2180 // Don't return a keyword token - continue to parse the delimiter
2181 // Skip any whitespace between operator and delimiter
2182 while let Some(ch) = self.current_char() {
2183 if ch.is_whitespace() {
2184 self.advance();
2185 } else {
2186 break;
2187 }
2188 }
2189
2190 // Get the delimiter
2191 #[allow(clippy::collapsible_if)]
2192 if let Some(delim) = self.current_char() {
2193 if !delim.is_alphanumeric() {
2194 self.advance();
2195 if let Some(ref mut info) = self.current_quote_op {
2196 info.delimiter = delim;
2197 }
2198 // Parse the quote operator content and return the complete token
2199 return self.parse_quote_operator(delim);
2200 }
2201 }
2202 } else {
2203 // Not a quote operator here → treat as IDENTIFIER
2204 self.current_quote_op = None;
2205 self.mode = LexerMode::ExpectOperator;
2206 return Some(Token {
2207 token_type: TokenType::Identifier(Arc::from(text)),
2208 start,
2209 end: self.position,
2210 text: Arc::from(text),
2211 });
2212 }
2213 } else {
2214 // End-of-input after the word → also treat as IDENTIFIER
2215 self.current_quote_op = None;
2216 self.mode = LexerMode::ExpectOperator;
2217 return Some(Token {
2218 token_type: TokenType::Identifier(Arc::from(text)),
2219 start,
2220 end: self.position,
2221 text: Arc::from(text),
2222 });
2223 }
2224 // If we get here but haven't returned, something went wrong
2225 // Fall through to treat as identifier
2226 self.current_quote_op = None;
2227 self.mode = LexerMode::ExpectOperator;
2228 return Some(Token {
2229 token_type: TokenType::Identifier(Arc::from(text)),
2230 start,
2231 end: self.position,
2232 text: Arc::from(text),
2233 });
2234 }
2235 // Format declarations need special handling
2236 "format" => {
2237 // We'll need to check for the = after the format name
2238 // For now, just mark that we saw format
2239 }
2240 _ if is_builtin_function(text) => {
2241 // Bare builtins are term-introducing in Perl.
2242 self.mode = LexerMode::ExpectTerm;
2243 }
2244 _ => {
2245 self.mode = LexerMode::ExpectOperator;
2246 }
2247 }
2248 TokenType::Keyword(Arc::from(text))
2249 } else {
2250 // Mirror parser bare-builtin handling so `/` after builtins like
2251 // `join` or `print` is lexed as a regex term, not division.
2252 if is_builtin_function(text) {
2253 self.mode = LexerMode::ExpectTerm;
2254 } else {
2255 self.mode = LexerMode::ExpectOperator;
2256 }
2257 TokenType::Identifier(Arc::from(text))
2258 };
2259
2260 self.after_arrow = false;
2261 // A keyword/identifier is not a variable; `{` after it is a block opener.
2262 self.after_var_subscript = false;
2263 // hash_brace_depth is managed by { and } handlers, not cleared per-token
2264 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2265 } else {
2266 None
2267 }
2268 }
2269
2270 /// Parse data section body - consumes everything to EOF
2271 fn parse_data_body(&mut self) -> Option<Token> {
2272 if self.position >= self.input.len() {
2273 // Already at EOF
2274 self.mode = LexerMode::ExpectTerm;
2275 return Some(Token {
2276 token_type: TokenType::EOF,
2277 text: Arc::from(""),
2278 start: self.position,
2279 end: self.position,
2280 });
2281 }
2282
2283 let start = self.position;
2284 // Consume everything to EOF
2285 let body = &self.input[self.position..];
2286 self.position = self.input.len();
2287
2288 // Reset mode for next parse (though we're at EOF)
2289 self.mode = LexerMode::ExpectTerm;
2290
2291 Some(Token {
2292 token_type: TokenType::DataBody(Arc::from(body)),
2293 text: Arc::from(body),
2294 start,
2295 end: self.position,
2296 })
2297 }
2298
2299 /// Parse format body - consumes until a line with just a dot
2300 fn parse_format_body(&mut self) -> Option<Token> {
2301 let start = self.position;
2302 let mut body = String::new();
2303 let mut line_start = true;
2304
2305 while self.position < self.input.len() {
2306 // Check if we're at the start of a line and the next char is a dot
2307 if line_start && self.current_char() == Some('.') {
2308 // Check if this line contains only a dot
2309 let mut peek_pos = self.position + 1;
2310 let mut found_terminator = true;
2311
2312 // Skip any trailing whitespace on the dot line
2313 while peek_pos < self.input.len() {
2314 match self.input_bytes[peek_pos] {
2315 b' ' | b'\t' | b'\r' => peek_pos += 1,
2316 b'\n' => break,
2317 _ => {
2318 found_terminator = false;
2319 break;
2320 }
2321 }
2322 }
2323
2324 if found_terminator {
2325 // We found the terminating dot, consume it
2326 self.position = peek_pos;
2327 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2328 {
2329 self.position += 1;
2330 }
2331
2332 // Switch back to normal mode
2333 self.mode = LexerMode::ExpectTerm;
2334
2335 return Some(Token {
2336 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2337 text: Arc::from(body),
2338 start,
2339 end: self.position,
2340 });
2341 }
2342 }
2343
2344 // Not a terminator, consume the character
2345 match self.current_char() {
2346 Some(ch) => {
2347 body.push(ch);
2348 self.advance();
2349
2350 // Track if we're at the start of a line
2351 line_start = ch == '\n';
2352 }
2353 None => {
2354 // Reached EOF without finding terminator
2355 break;
2356 }
2357 }
2358 }
2359
2360 // If we reach here, we didn't find a terminator
2361 self.mode = LexerMode::ExpectTerm;
2362 Some(Token {
2363 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2364 text: Arc::from(body),
2365 start,
2366 end: self.position,
2367 })
2368 }
2369
    /// Scan an operator token at the current position.
    ///
    /// Returns `None` when the current character cannot start an operator,
    /// or when the lexer is waiting for a quote-operator delimiter (which
    /// must not be consumed here). On success, updates `self.mode` (most
    /// operators switch to `ExpectTerm`; postfix `++`/`--` keep
    /// `ExpectOperator`), and clears/sets the `after_sub`, `after_arrow`,
    /// and `after_var_subscript` flags accordingly. Handles the `/`
    /// division-vs-regex ambiguity per the strategy documented below.
    fn try_operator(&mut self) -> Option<Token> {
        // Skip operator parsing if we're expecting a delimiter for a quote operator
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            return None;
        }

        let start = self.position;
        let ch = self.current_char()?;

        // ═══════════════════════════════════════════════════════════════════════
        // SLASH DISAMBIGUATION STRATEGY (Issue #422)
        // ═══════════════════════════════════════════════════════════════════════
        //
        // Perl's `/` character is ambiguous:
        //   - Division operator: `$x / 2`
        //   - Regex delimiter: `/pattern/`
        //   - Defined-or operator: `$x // $y`
        //
        // **Disambiguation Strategy (Context-Aware Heuristics):**
        //
        // 1. **Mode-Based Decision (Primary)**:
        //    - `LexerMode::ExpectTerm` → `/` starts a regex
        //      Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
        //    - `LexerMode::ExpectOperator` → `/` is division or `//`
        //      Examples: `$x / 2`, `$x // $y`, `) / 3`
        //
        // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
        //    Mode is set based on previous token:
        //    - After identifier/number/closing paren → ExpectOperator → division
        //    - After operator/keyword/opening paren → ExpectTerm → regex
        //
        // 3. **Budget Protection**:
        //    - Regex parsing has a parse-step budget and byte budget
        //    - Budget exceeded → emit UnknownRest token (graceful degradation)
        //    - See `parse_regex()` and `budget_guard()` for implementation
        //
        // 4. **Performance Characteristics**:
        //    - Single-pass: O(1) decision based on mode flag
        //    - No backtracking: Mode updated after each token
        //    - Optimized: Byte-level operations for common cases
        //
        // **Metrics & Monitoring**:
        // - Budget exceeded events tracked via UnknownRest token emission
        // - LSP diagnostics generated for truncated regexes
        // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
        //
        // ═══════════════════════════════════════════════════════════════════════

        if ch == '/' {
            if self.mode == LexerMode::ExpectTerm {
                // Mode indicates we're expecting a term → `/` starts a regex
                // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
                return self.parse_regex(start);
            } else {
                // Mode indicates we're expecting an operator → `/` is division or `//`
                // Examples: `$x / 2`, `$x // $y`, `10 / 3`
                self.advance();
                // Check for // or //= using byte-level operations for speed
                if self.peek_byte(0) == Some(b'/') {
                    self.position += 1; // consume second / directly
                    if self.peek_byte(0) == Some(b'=') {
                        self.position += 1; // consume = directly
                        // `//=` defined-or assignment.
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    } else {
                        // Use cached string for common "//" operator
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from("//")),
                            text: Arc::from("//"),
                            start,
                            end: self.position,
                        });
                    }
                } else if self.position < self.input_bytes.len()
                    && self.input_bytes[self.position] == b'='
                {
                    // /= division-assign operator
                    self.position += 1; // consume =
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Operator(Arc::from("/=")),
                        text: Arc::from("/="),
                        start,
                        end: self.position,
                    });
                } else {
                    // Use cached string for common "/" division
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Division,
                        text: Arc::from("/"),
                        start,
                        end: self.position,
                    });
                }
            }
        }

        // Handle other operators - simplified
        match ch {
            '.' => {
                // Check if it's a decimal number like .5 -- but only when we
                // expect a term. In operator position `.5` is concatenation
                // of the bareword/number on the left with the number `5`.
                if self.mode != LexerMode::ExpectOperator
                    && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
                {
                    return self.parse_decimal_number(start);
                }
                self.advance();
                // Check for compound operators (e.g. `..`, `.=`)
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        // NOTE(review): in this branch `ch` is always '.', so only
                        // the `...` (yada/flip-flop range) arm below can fire; the
                        // matches! arms and the `<=>` check appear unreachable here
                        // and mirror the generic operator branch — confirm before
                        // simplifying.
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            } else if ch == '.' && next == '.' && third == Some('.') {
                                self.advance(); // consume the third .
                            }
                        }
                    }
                }
            }
            '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
            | '\\' => {
                self.advance();
                // Check for compound operators
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        // NOTE(review): the ('/', '/', '=') arm cannot fire here —
                        // '/' is fully handled (and returns) earlier in this
                        // function — confirm before removing.
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            }
                        }
                    }
                }
            }
            _ => return None,
        }

        let text = &self.input[start..self.position];
        // Operator ends prototype window (e.g. `:` for attributes)
        self.after_sub = false;
        // Track whether this operator is '->' for method name disambiguation
        self.after_arrow = text == "->";
        // Any operator token ends the "just saw a variable" window; `{` after
        // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
        self.after_var_subscript = false;
        // Postfix ++ and -- complete a term expression, so next token is an operator
        // (e.g., "$x++ / 2" → / is division, not regex)
        if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
            // Postfix: stay in ExpectOperator
        } else {
            self.mode = LexerMode::ExpectTerm;
        }

        Some(Token {
            token_type: TokenType::Operator(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
2574
    /// Scan a structural delimiter token: parentheses, brackets, braces,
    /// comma, semicolon, or — when a quote operator is pending — the quote
    /// delimiter itself (any non-alphanumeric, non-whitespace character).
    ///
    /// Returns `None` if the current character is not a delimiter. Side
    /// effects maintained here: `paren_depth`, `hash_brace_depth`, the
    /// prototype-tracking flags (`after_sub`, `in_prototype`,
    /// `prototype_depth`), `after_arrow`, `after_var_subscript`, and
    /// `self.mode`. The exact flag transitions per delimiter are documented
    /// inline — they drive regex/division and subscript/block disambiguation.
    fn try_delimiter(&mut self) -> Option<Token> {
        let start = self.position;
        let ch = self.current_char()?;

        // If we're expecting a delimiter for a quote operator, handle it specially
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            // Accept any non-alphanumeric character as a delimiter
            if !ch.is_alphanumeric() && !ch.is_whitespace() {
                self.advance();
                if let Some(ref mut info) = self.current_quote_op {
                    info.delimiter = ch;
                }
                // Now parse the quote operator content
                return self.parse_quote_operator(ch);
            }
        }

        match ch {
            '(' => {
                // Check if this is a quote operator delimiter
                if matches!(self.mode, LexerMode::ExpectDelimiter)
                    && self.current_quote_op.is_some()
                {
                    self.advance();
                    if let Some(ref mut info) = self.current_quote_op {
                        info.delimiter = ch;
                    }
                    return self.parse_quote_operator(ch);
                }

                self.advance();
                if self.after_sub {
                    // Promote after_sub to in_prototype now that we see '('
                    self.in_prototype = true;
                    self.after_sub = false;
                    self.prototype_depth = 1;
                } else if self.in_prototype {
                    // Nested '(' inside a prototype — track depth so the
                    // matching ')' can close it correctly.
                    self.prototype_depth += 1;
                }
                self.paren_depth += 1;
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::LeftParen,
                    text: Arc::from("("),
                    start,
                    end: self.position,
                })
            }
            ')' => {
                self.advance();
                // Unwind prototype tracking; depth 0 means the prototype
                // parenthesis group is fully closed.
                if self.in_prototype && self.prototype_depth > 0 {
                    self.prototype_depth -= 1;
                    if self.prototype_depth == 0 {
                        self.in_prototype = false;
                    }
                }
                self.after_arrow = false;
                self.paren_depth = self.paren_depth.saturating_sub(1);
                // A closing paren ends any var-subscript context: `if ($var)` should
                // NOT leave after_var_subscript set, otherwise the following `{` would
                // incorrectly increment hash_brace_depth and suppress regex operators
                // inside the block body (issue #2844).
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectOperator;
                Some(Token {
                    token_type: TokenType::RightParen,
                    text: Arc::from(")"),
                    start,
                    end: self.position,
                })
            }
            ';' => {
                self.advance();
                // Semicolon ends prototype window (forward declaration)
                self.after_sub = false;
                // Semicolon is a statement boundary — any pending method-call chain is over.
                self.after_arrow = false;
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::Semicolon,
                    text: Arc::from(";"),
                    start,
                    end: self.position,
                })
            }
            ',' => {
                self.advance();
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::Comma,
                    text: Arc::from(","),
                    start,
                    end: self.position,
                })
            }
            '[' => {
                self.advance();
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::LeftBracket,
                    text: Arc::from("["),
                    start,
                    end: self.position,
                })
            }
            ']' => {
                self.advance();
                // A closing `]` from an array subscript leaves us in a state where
                // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
                // Set after_var_subscript so the `{` handler recognises it as such.
                // This mirrors the `}` handler's behavior when closing a hash subscript.
                self.after_var_subscript = true;
                self.mode = LexerMode::ExpectOperator;
                Some(Token {
                    token_type: TokenType::RightBracket,
                    text: Arc::from("]"),
                    start,
                    end: self.position,
                })
            }
            '{' => {
                self.advance();
                // Opening brace ends prototype window — no prototype follows
                self.after_sub = false;
                // `{` is a hash/slice subscript opener only when it immediately follows
                // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
                // This is narrower than the old `mode == ExpectOperator` check, which
                // incorrectly incremented depth for block-opening braces after `sub foo`,
                // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
                // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
                if self.after_var_subscript {
                    self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
                }
                self.after_var_subscript = false;
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::LeftBrace,
                    text: Arc::from("{"),
                    start,
                    end: self.position,
                })
            }
            '}' => {
                self.advance();
                self.after_arrow = false;
                // Decrement hash subscript brace depth only if we were inside one.
                // If depth > 0, this closes a hash subscript; enable chained subscripts
                // like $h{a}{b} by setting after_var_subscript so the next `{` is
                // recognized as another subscript opener.
                if self.hash_brace_depth > 0 {
                    self.hash_brace_depth -= 1;
                    // The subscript value is now the "variable" for a chained subscript.
                    self.after_var_subscript = true;
                } else {
                    // Block-close `}` — no subscript follows
                    self.after_var_subscript = false;
                }
                self.mode = LexerMode::ExpectOperator;
                Some(Token {
                    token_type: TokenType::RightBrace,
                    text: Arc::from("}"),
                    start,
                    end: self.position,
                })
            }
            '#' => {
                // Only treat as delimiter in ExpectDelimiter mode
                // (otherwise `#` starts a comment and is handled elsewhere).
                if matches!(self.mode, LexerMode::ExpectDelimiter) {
                    self.advance();
                    // Reset mode after consuming delimiter
                    self.mode = LexerMode::ExpectTerm;
                    Some(Token {
                        token_type: TokenType::Operator(Arc::from("#")),
                        text: Arc::from("#"),
                        start,
                        end: self.position,
                    })
                } else {
                    None
                }
            }
            _ => None,
        }
    }
2763
    /// Parse a double-quoted string starting at `start` (the opening `"`).
    ///
    /// When `config.parse_interpolation` is enabled, `$`-interpolation is
    /// decomposed into [`StringPart`]s: literal runs, `${...}` expressions,
    /// simple `$name` variables, `->`-chained element/method accesses
    /// (`$x->[0]`, `$x->{k}`, `$x->meth(...)`), and direct `$name[...]` /
    /// `$name{...}` lookups. A string with no interpolated parts becomes a
    /// plain `StringLiteral`; otherwise an `InterpolatedString`. Escapes are
    /// kept verbatim (`\` + next char) for later processing. On EOF without
    /// a closing quote, consumes to EOF and returns one error token.
    ///
    /// NOTE(review): only the `$` sigil is split into parts here; `@array`
    /// interpolation is not handled in this function — confirm it is covered
    /// elsewhere (or intentionally left literal).
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        // Guards the loop against a stuck position (defensive).
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                '"' => {
                    // Closing quote: flush any pending literal run and emit.
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Keep the escape verbatim (backslash + escaped char).
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    let part_start = self.position;
                    self.advance();
                    match self.current_char() {
                        Some('{') => {
                            // `${ expr }` block form — take the balanced braces.
                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                            parts.push(StringPart::Expression(Arc::from(
                                &self.input[part_start..self.position],
                            )));
                        }
                        Some(ch) if is_perl_identifier_start(ch) => {
                            let var_start = self.position;

                            // Fast path for ASCII identifier continuation
                            while self.position < self.input_bytes.len() {
                                let byte = self.input_bytes[self.position];
                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                    self.position += 1;
                                } else if byte >= 128 {
                                    // Only use UTF-8 parsing for non-ASCII
                                    if let Some(ch) = self.current_char() {
                                        if is_perl_identifier_continue(ch) {
                                            self.advance();
                                        } else {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                } else {
                                    break;
                                }
                            }

                            if self.position > var_start {
                                // The variable part includes the `$` sigil.
                                let var_name = &self.input[part_start..self.position];
                                parts.push(StringPart::Variable(Arc::from(var_name)));

                                if self.matches_bytes(b"->") {
                                    // Arrow chain: `$x->[...]`, `$x->{...}`,
                                    // `$x->(...)` or `$x->method(...)`.
                                    let tail_start = self.position;
                                    self.advance();
                                    self.advance();

                                    match self.current_char() {
                                        Some('[') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('[', ']', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('{') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('{', '}', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('(') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('(', ')', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some(ch) if is_perl_identifier_start(ch) => {
                                            // Method name (same fast-path identifier
                                            // scan as above), then optional arg list.
                                            while self.position < self.input_bytes.len() {
                                                let byte = self.input_bytes[self.position];
                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                                    self.position += 1;
                                                } else if byte >= 128 {
                                                    if let Some(ch) = self.current_char() {
                                                        if is_perl_identifier_continue(ch) {
                                                            self.advance();
                                                        } else {
                                                            break;
                                                        }
                                                    } else {
                                                        break;
                                                    }
                                                } else {
                                                    break;
                                                }
                                            }
                                            if self.current_char() == Some('(') {
                                                let _ = self.consume_balanced_segment_in_string(
                                                    '(', ')', '"',
                                                );
                                            }
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        _ => {
                                            // Bare `->` with nothing recognizable after
                                            // it — keep what we consumed as-is.
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                    }
                                } else if self.current_char() == Some('[') {
                                    // Direct element/slice: `$name[...]`.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
                                    parts.push(StringPart::ArraySlice(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                } else if self.current_char() == Some('{') {
                                    // Direct hash element: `$name{...}`.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                                    parts.push(StringPart::Expression(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                }
                            }
                        }
                        _ => {}
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        Some(self.unterminated_string_error(start))
    }
2946
2947 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2948 self.advance(); // Skip opening quote
2949
2950 let mut last_pos = self.position;
2951
2952 while let Some(ch) = self.current_char() {
2953 match ch {
2954 '\'' => {
2955 self.advance();
2956 let text = &self.input[start..self.position];
2957 self.mode = LexerMode::ExpectOperator;
2958
2959 return Some(Token {
2960 token_type: TokenType::StringLiteral,
2961 text: Arc::from(text),
2962 start,
2963 end: self.position,
2964 });
2965 }
2966 '\\' => {
2967 self.advance();
2968 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2969 self.advance();
2970 }
2971 }
2972 _ => self.advance(),
2973 }
2974
2975 // Safety check: ensure we're making progress
2976 if self.position == last_pos {
2977 break;
2978 }
2979 last_pos = self.position;
2980 }
2981
2982 Some(self.unterminated_string_error(start))
2983 }
2984
2985 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2986 self.advance(); // Skip opening backtick
2987
2988 let mut last_pos = self.position;
2989
2990 while let Some(ch) = self.current_char() {
2991 match ch {
2992 '`' => {
2993 self.advance();
2994 let text = &self.input[start..self.position];
2995 self.mode = LexerMode::ExpectOperator;
2996
2997 return Some(Token {
2998 token_type: TokenType::QuoteCommand,
2999 text: Arc::from(text),
3000 start,
3001 end: self.position,
3002 });
3003 }
3004 '\\' => {
3005 self.advance();
3006 if self.current_char().is_some() {
3007 self.advance();
3008 }
3009 }
3010 _ => self.advance(),
3011 }
3012
3013 // Safety check: ensure we're making progress
3014 if self.position == last_pos {
3015 break;
3016 }
3017 last_pos = self.position;
3018 }
3019
3020 Some(self.unterminated_string_error(start))
3021 }
3022
    /// Placeholder for dedicated q-string parsing.
    ///
    /// NOTE(review): currently a stub that always returns `None`, so callers
    /// fall through to the other tokenization paths. The `_start` offset is
    /// accepted for interface symmetry with the sibling `parse_*` methods.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
3027
3028 #[inline]
3029 fn unterminated_string_error(&mut self, start: usize) -> Token {
3030 // Consume to EOF so the caller receives a single terminal error token.
3031 let end = self.input.len();
3032 self.position = end;
3033
3034 Token {
3035 token_type: TokenType::Error(Arc::from("unterminated string")),
3036 text: Arc::from(&self.input[start..end]),
3037 start,
3038 end,
3039 }
3040 }
3041
3042 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
3043 // We've already consumed 's'
3044 let delimiter = self.current_char()?;
3045 self.advance(); // Skip delimiter
3046 self.parse_substitution_with_delimiter(start, delimiter)
3047 }
3048
    /// Parse the pattern/replacement/modifier tail of an `s` substitution,
    /// given that the opening delimiter has already been consumed.
    ///
    /// * Paired delimiters (e.g. `s{...}{...}`): the replacement has its own
    ///   delimiter, optionally preceded by whitespace.
    /// * Non-paired delimiters (e.g. `s/.../.../`): the replacement reuses the
    ///   same delimiter with no intervening whitespace.
    ///
    /// Trailing alphanumerics are consumed as modifiers without validation;
    /// the parser rejects invalid ones (MUT_005 fix). Always produces a
    /// `Substitution` token spanning `start..self.position` and switches the
    /// lexer into operator-expecting mode.
    fn parse_substitution_with_delimiter(
        &mut self,
        start: usize,
        delimiter: char,
    ) -> Option<Token> {
        // Pattern section. The `closed` flag from read_delimited_body is
        // deliberately ignored: an unterminated pattern still yields a token
        // covering whatever was scanned.
        self.read_delimited_body(delimiter);

        let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
        if pattern_is_paired {
            while self.current_char().is_some_and(char::is_whitespace) {
                self.advance();
            }

            if let Some(repl_delim) = self.current_char()
                && Self::is_quote_delim(repl_delim)
            {
                self.advance();
                self.read_delimited_body(repl_delim);
            }
        } else {
            // Same delimiter terminates both sections: s/pat/repl/.
            self.read_delimited_body(delimiter);
        }

        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
        while let Some(ch) = self.current_char() {
            if ch.is_ascii_alphanumeric() {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Substitution,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
3091
3092 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
3093 // We've already consumed 'tr' or 'y'
3094 while self.current_char().is_some_and(char::is_whitespace) {
3095 self.advance();
3096 }
3097
3098 let delimiter = self.current_char()?;
3099 self.advance(); // Skip delimiter
3100 self.parse_transliteration_with_delimiter(start, delimiter)
3101 }
3102
    /// Parse the search/replacement/modifier tail of a `tr`/`y` operator,
    /// given that the opening delimiter has already been consumed.
    ///
    /// Mirrors `parse_substitution_with_delimiter`: paired delimiters allow a
    /// separately-delimited replacement list (optionally after whitespace),
    /// while non-paired delimiters reuse the same delimiter. Trailing
    /// alphanumerics are consumed as modifiers without validation; the parser
    /// rejects invalid ones (MUT_005 fix). Always produces a
    /// `Transliteration` token and switches into operator-expecting mode.
    fn parse_transliteration_with_delimiter(
        &mut self,
        start: usize,
        delimiter: char,
    ) -> Option<Token> {
        // Search-list section; the `closed` flag is deliberately ignored so an
        // unterminated list still yields a token.
        self.read_delimited_body(delimiter);

        let search_is_paired = quote_handler::paired_close(delimiter).is_some();
        if search_is_paired {
            while self.current_char().is_some_and(char::is_whitespace) {
                self.advance();
            }

            if let Some(repl_delim) = self.current_char()
                && Self::is_quote_delim(repl_delim)
            {
                self.advance();
                self.read_delimited_body(repl_delim);
            }
        } else {
            // Same delimiter terminates both sections: tr/abc/xyz/.
            self.read_delimited_body(delimiter);
        }

        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
        while let Some(ch) = self.current_char() {
            if ch.is_ascii_alphanumeric() {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Transliteration,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
3145
    /// Read content between delimiters.
    ///
    /// Returns `(body, closed)` where `closed` is `true` if the closing
    /// delimiter was found before EOF, and `false` if EOF was reached first.
    ///
    /// Paired delimiters (those for which `quote_handler::paired_close`
    /// returns `Some`) nest: re-occurrences of the opener increase `depth` and
    /// are kept in the body, and the body ends only when the closer balances
    /// the initial opener. Non-paired delimiters end at the first unescaped
    /// occurrence. A backslash keeps itself and the following character
    /// verbatim. The outermost delimiters are never included in `body`.
    fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
        // `close` is the paired counterpart for bracket-style delimiters,
        // otherwise the delimiter itself.
        let paired = quote_handler::paired_close(delim);
        let close = paired.unwrap_or(delim);
        let mut body = String::new();
        // Depth starts at 1 for paired delimiters: the caller already consumed
        // the opening bracket.
        let mut depth = i32::from(paired.is_some());

        while let Some(ch) = self.current_char() {
            if ch == '\\' {
                // Escaped character: keep both bytes; neither can close.
                body.push(ch);
                self.advance();
                if let Some(next) = self.current_char() {
                    body.push(next);
                    self.advance();
                }
                continue;
            }

            if paired.is_some() && ch == delim {
                // Nested opener: part of the body, one more level to close.
                body.push(ch);
                self.advance();
                depth += 1;
                continue;
            }

            if ch == close {
                if paired.is_some() {
                    depth -= 1;
                    if depth == 0 {
                        // Balanced the original opener: body complete.
                        self.advance();
                        return (body, true);
                    }
                    // Closer of a nested pair stays in the body.
                    body.push(ch);
                    self.advance();
                } else {
                    self.advance();
                    return (body, true);
                }
                continue;
            }

            body.push(ch);
            self.advance();
        }

        // EOF reached without finding the closing delimiter
        (body, false)
    }
3197
3198 /// Parse a quote operator after we've seen the delimiter
3199 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3200 let info = self.current_quote_op.as_ref()?;
3201 let start = info.start_pos;
3202 let operator = info.operator.clone();
3203
3204 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3205 // below) does not leave a stale reference behind. The post-match cleanup at the
3206 // bottom of this function would otherwise be skipped for those operators.
3207 self.current_quote_op = None;
3208
3209 // Parse based on operator type; track whether all delimiters were closed.
3210 let closed = match operator.as_str() {
3211 "s" => {
3212 return self.parse_substitution_with_delimiter(start, delimiter);
3213 }
3214 "tr" | "y" => {
3215 return self.parse_transliteration_with_delimiter(start, delimiter);
3216 }
3217 "qr" => {
3218 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3219 self.parse_regex_modifiers("e_handler::QR_SPEC);
3220 body_closed
3221 }
3222 "m" => {
3223 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3224 self.parse_regex_modifiers("e_handler::M_SPEC);
3225 body_closed
3226 }
3227 _ => {
3228 // q, qq, qw, qx - no modifiers
3229 let (_body, body_closed) = self.read_delimited_body(delimiter);
3230 body_closed
3231 }
3232 };
3233
3234 let text = &self.input[start..self.position];
3235
3236 self.mode = LexerMode::ExpectOperator;
3237
3238 if !closed {
3239 // EOF reached before finding the closing delimiter — emit an error
3240 // token so the parser's recovery mechanism records a diagnostic.
3241 return Some(Token {
3242 token_type: TokenType::Error(Arc::from(format!(
3243 "unclosed {} delimiter '{}'",
3244 operator, delimiter
3245 ))),
3246 text: Arc::from(text),
3247 start,
3248 end: self.position,
3249 });
3250 }
3251
3252 let token_type = quote_handler::get_quote_token_type(&operator);
3253 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3254 }
3255
3256 /// Parse regex modifiers according to the given spec
3257 ///
3258 /// This function includes ALL characters that could be intended as modifiers,
3259 /// including invalid ones. This allows the parser to properly reject invalid
3260 /// modifiers with a clear error message, rather than leaving them as separate
3261 /// tokens that could be confusingly parsed.
3262 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3263 // Consume all alphanumeric characters that could be intended as modifiers
3264 // The parser will validate and reject invalid ones
3265 while let Some(ch) = self.current_char() {
3266 if ch.is_ascii_alphanumeric() {
3267 self.advance();
3268 } else {
3269 break;
3270 }
3271 }
3272 // Note: We no longer validate here - the parser will validate and provide
3273 // clear error messages for invalid modifiers (MUT_005 fix)
3274 }
3275
    /// Parse a regex literal starting with `/`
    ///
    /// **Budget Protection (Issue #422)**:
    /// - Budget guards prevent runaway scanning on pathological input
    /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
    /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
    /// - Graceful degradation: emit UnknownRest token if budget exceeded
    ///
    /// **Performance**:
    /// - Single-pass scanning with escape handling
    /// - Budget check per iteration (amortized O(1) via inline fast path)
    /// - Typical regex: <10μs, Large regex (64KB): ~1ms
    fn parse_regex(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening /

        let mut regex_parse_steps: usize = 0;
        // A `/` inside `[...]` does not terminate the regex, so we track
        // whether the scanner is currently inside a character class.
        let mut in_character_class = false;

        while let Some(ch) = self.current_char() {
            regex_parse_steps += 1;
            if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
                // Step budget exhausted: log (debug builds only) and consume
                // the rest of the input as a single UnknownRest token.
                #[cfg(debug_assertions)]
                {
                    let text = &self.input[start..self.position];
                    let preview = truncate_preview(text, 50);
                    tracing::debug!(
                        limit = MAX_REGEX_PARSE_STEPS,
                        pattern_preview = %preview,
                        "Regex parse step budget exceeded"
                    );
                }
                self.position = self.input.len();
                return Some(Token {
                    token_type: TokenType::UnknownRest,
                    text: empty_arc(),
                    start,
                    end: self.position,
                });
            }

            // Budget guard: prevent timeout on pathological input (Issue #422)
            // If exceeded, returns UnknownRest token for graceful degradation
            if let Some(token) = self.budget_guard(start, 0) {
                return Some(token);
            }

            match ch {
                '/' if !in_character_class => {
                    self.advance();
                    // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
                    while let Some(ch) = self.current_char() {
                        if ch.is_ascii_alphanumeric() {
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::RegexMatch,
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Handle escape sequences: consume backslash + next char
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                '[' => {
                    // Character classes are not depth-counted: a second `[`
                    // inside a class just re-sets the flag, which is harmless.
                    in_character_class = true;
                    self.advance();
                }
                ']' if in_character_class => {
                    in_character_class = false;
                    self.advance();
                }
                _ => self.advance(),
            }
        }

        // Unterminated regex - EOF reached before closing /
        // Parser will emit diagnostic for unterminated literal
        None
    }
3367}
3368
// Process-wide cached empty Arc<str>: callers get refcount bumps, never fresh
// allocations.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return the shared empty `Arc<str>`, initializing it on first use.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
3376
/// Produce a log-friendly preview of `text`: at most `max_chars` characters,
/// with `...` appended when anything was cut. Truncation is done at a char
/// boundary, so multi-byte UTF-8 input is never split.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    let cut_at = text.char_indices().nth(max_chars).map(|(idx, _)| idx);
    if let Some(idx) = cut_at {
        format!("{}...", &text[..idx])
    } else {
        // Fewer than max_chars characters: return the whole string unchanged.
        text.to_string()
    }
}
3383
3384#[inline(always)]
3385fn is_keyword_fast(word: &str) -> bool {
3386 // Fast length-based rejection for most cases.
3387 // Lexer keywords are currently bounded to 1..=9 characters.
3388 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
3389}
3390
3391#[inline]
3392fn is_builtin_function(word: &str) -> bool {
3393 BARE_TERM_BUILTINS.binary_search(&word).is_ok()
3394}
3395
/// True when `word` (as bytes) is exactly one of the quote-like operator
/// keywords: `m`, `q`, `qq`, `qw`, `qx`, `qr`.
#[inline(always)]
fn is_quote_op_word_prefix(word: &[u8]) -> bool {
    match word {
        b"m" | b"q" => true,
        [b'q', second] => matches!(second, b'q' | b'w' | b'x' | b'r'),
        _ => false,
    }
}
3400
/// Builtins treated as able to be directly followed by a bare term (e.g. a
/// `/pattern/` regex rather than division — see the builtin disambiguation
/// tests).
///
/// INVARIANT: must remain sorted in ASCII order — `is_builtin_function`
/// binary-searches this slice.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex", "int",
    "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print", "push",
    "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt", "substr",
    "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
3407
/// Fast lookup table for compound operator second characters.
///
/// INVARIANT: every byte that can appear as the second character of a
/// two-character compound operator must be listed here, because
/// `is_compound_operator` uses this table as an early-reject filter before
/// its exhaustive match. Fix: `b'/'` was missing, which made the
/// `(b'/', b'/')` arm (defined-or `//`) unreachable on the ASCII fast path
/// and inconsistent with the non-ASCII fallback.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*:/";
3410
/// Whether the two characters form a two-character compound operator
/// (`+=`, `==`, `=~`, `++`, `&&`, `<<`, `->`, `::`, …).
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    // Optimized compound operator lookup using perfect hashing for common cases
    // Convert to bytes for faster comparison (most operators are ASCII)
    if first.is_ascii() && second.is_ascii() {
        let first_byte = first as u8;
        let second_byte = second as u8;

        // Early reject: COMPOUND_SECOND_CHARS must stay in sync with the match
        // arms below — any second byte missing from the table (e.g. b'/' for
        // the defined-or `//` arm) makes its arm unreachable and diverges from
        // the non-ASCII fallback. NOTE(review): verify the table lists every
        // second byte matched below.
        if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
            return false;
        }

        // Use lookup table approach for maximum performance
        match (first_byte, second_byte) {
            // Assignment operators
            (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,

            // Comparison operators
            (b'<' | b'>' | b'=' | b'!', b'=') => true,

            // Pattern operators
            (b'=' | b'!', b'~') => true,

            // Increment/decrement
            (b'+', b'+') | (b'-', b'-') => true,

            // Logical operators
            (b'&', b'&') | (b'|', b'|') => true,

            // Shift operators
            (b'<', b'<') | (b'>', b'>') => true,

            // Other compound operators
            (b'*', b'*')
            | (b'/', b'/')
            | (b'-' | b'=', b'>')
            | (b'.', b'.')
            | (b'~', b'~')
            | (b':', b':') => true,

            _ => false,
        }
    } else {
        // Fallback for non-ASCII (should be rare)
        matches!(
            (first, second),
            ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
                | ('=' | '!' | '~', '~')
                | ('+', '+')
                | ('-', '-' | '>')
                | ('&', '&')
                | ('|', '|')
                | ('<', '<')
                | ('>' | '=', '>')
                | ('*', '*')
                | ('/', '/')
                | ('.', '.')
                | (':', ':')
        )
    }
}
3472
// Checkpoint support for incremental parsing
impl Checkpointable for PerlLexer<'_> {
    /// Snapshot the complete lexer state so it can later be restored by
    /// `restore`. The checkpoint also records a coarse `CheckpointContext`
    /// describing which construct (format body, quote-like, or normal code)
    /// the lexer was inside.
    fn checkpoint(&self) -> LexerCheckpoint {
        use checkpoint::CheckpointContext;

        // Determine the checkpoint context based on current state
        let context = if matches!(self.mode, LexerMode::InFormatBody) {
            CheckpointContext::Format {
                // NOTE(review): the format start is only approximated by
                // backing up 100 bytes — confirm consumers tolerate this.
                start_position: self.position.saturating_sub(100), // Approximate
            }
        } else if !self.delimiter_stack.is_empty() {
            // We're in some kind of quote-like construct
            CheckpointContext::QuoteLike {
                operator: String::new(), // Would need to track this
                delimiter: self.delimiter_stack.last().copied().unwrap_or('\0'),
                is_paired: true,
            }
        } else {
            CheckpointContext::Normal
        };

        LexerCheckpoint {
            position: self.position,
            mode: self.mode,
            delimiter_stack: self.delimiter_stack.clone(),
            in_prototype: self.in_prototype,
            prototype_depth: self.prototype_depth,
            after_sub: self.after_sub,
            after_arrow: self.after_arrow,
            hash_brace_depth: self.hash_brace_depth,
            after_var_subscript: self.after_var_subscript,
            paren_depth: self.paren_depth,
            current_pos: self.current_pos,
            context,
        }
    }

    /// Restore every state field captured by `checkpoint`. Field-for-field
    /// mirror of the snapshot above; `clone_from` reuses the existing
    /// delimiter-stack allocation.
    fn restore(&mut self, checkpoint: &LexerCheckpoint) {
        self.position = checkpoint.position;
        self.mode = checkpoint.mode;
        self.delimiter_stack.clone_from(&checkpoint.delimiter_stack);
        self.in_prototype = checkpoint.in_prototype;
        self.prototype_depth = checkpoint.prototype_depth;
        self.after_sub = checkpoint.after_sub;
        self.after_arrow = checkpoint.after_arrow;
        self.hash_brace_depth = checkpoint.hash_brace_depth;
        self.after_var_subscript = checkpoint.after_var_subscript;
        self.paren_depth = checkpoint.paren_depth;
        self.current_pos = checkpoint.current_pos;

        // Handle special contexts
        use checkpoint::CheckpointContext;
        if let CheckpointContext::Format { .. } = &checkpoint.context {
            // Ensure we're in format body mode
            if !matches!(self.mode, LexerMode::InFormatBody) {
                self.mode = LexerMode::InFormatBody;
            }
        }
    }

    /// A checkpoint is restorable as long as its position lies within the
    /// current input (checkpoints from a different/longer input are rejected).
    fn can_restore(&self, checkpoint: &LexerCheckpoint) -> bool {
        // Can restore if the position is valid for our input
        checkpoint.position <= self.input.len()
    }
}
3538
3539#[cfg(test)]
3540mod test_format_debug;
3541
#[cfg(test)]
mod tests {
    use super::*;

    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    #[test]
    fn test_basic_tokens() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 42;");

        let token = lexer.next_token().ok_or("Expected keyword token")?;
        assert_eq!(token.token_type, TokenType::Keyword(Arc::from("my")));

        let token = lexer.next_token().ok_or("Expected identifier token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(_)));

        let token = lexer.next_token().ok_or("Expected operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(_)));

        let token = lexer.next_token().ok_or("Expected number token")?;
        assert!(matches!(token.token_type, TokenType::Number(_)));

        let token = lexer.next_token().ok_or("Expected semicolon token")?;
        assert_eq!(token.token_type, TokenType::Semicolon);
        Ok(())
    }

    #[test]
    fn test_slash_disambiguation() -> TestResult {
        // Division
        let mut lexer = PerlLexer::new("10 / 2");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);

        // Regex
        let mut lexer = PerlLexer::new("if (/pattern/)");
        lexer.next_token(); // if
        lexer.next_token(); // (
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }

    #[test]
    fn test_percent_and_double_sigil_disambiguation() -> TestResult {
        // Hash variable
        let mut lexer = PerlLexer::new("%hash");
        let token = lexer.next_token().ok_or("Expected hash identifier token")?;
        assert!(
            matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "%hash")
        );

        // Modulo operator
        let mut lexer = PerlLexer::new("10 % 3");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected modulo operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "%"));
        Ok(())
    }

    #[test]
    fn test_defined_or_and_exponent() -> TestResult {
        // Defined-or operator
        let mut lexer = PerlLexer::new("$a // $b");
        lexer.next_token(); // $a
        let token = lexer.next_token().ok_or("Expected defined-or operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "//"));

        // Regex after =~ should still parse
        let mut lexer = PerlLexer::new("$x =~ //");
        lexer.next_token(); // $x
        lexer.next_token(); // =~
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);

        // Exponent operator
        let mut lexer = PerlLexer::new("2 ** 3");
        lexer.next_token(); // 2
        let token = lexer.next_token().ok_or("Expected exponent operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "**"));
        Ok(())
    }

    #[test]
    fn test_join_regex_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("join /,/, @parts");
        let token = lexer.next_token().ok_or("Expected join token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "join"));

        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }

    #[test]
    fn test_builtin_regex_disambiguation() -> TestResult {
        for code in ["print /pattern/", "defined /pattern/", "keys /pattern/"] {
            let mut lexer = PerlLexer::new(code);
            lexer.next_token();
            let token = lexer.next_token().ok_or("Expected regex token")?;
            assert_eq!(token.token_type, TokenType::RegexMatch, "{code}");
        }
        Ok(())
    }

    #[test]
    fn test_nullary_builtin_division_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("time / 2");
        let token = lexer.next_token().ok_or("Expected time token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "time"));

        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);
        Ok(())
    }

    #[test]
    fn test_peek_token_does_not_mutate_paren_depth() -> TestResult {
        // Regression guard for issue #2750: peek_token() must save and restore
        // paren_depth so that a peek at `(` does not permanently increment
        // paren_depth and corrupt the heredoc/bitshift guard on a subsequent token.
        let mut lexer = PerlLexer::new("(1<<2)");
        assert_eq!(lexer.paren_depth, 0, "paren_depth must start at 0");

        // Peek at `(` — must not permanently increment paren_depth
        let peeked = lexer.peek_token().ok_or("peek at ( failed")?;
        assert_eq!(peeked.token_type, TokenType::LeftParen);
        assert_eq!(lexer.paren_depth, 0, "peek_token must not mutate paren_depth");

        // Consume `(` — paren_depth becomes 1
        lexer.next_token();
        assert_eq!(lexer.paren_depth, 1);

        // Peek at `1` (a number) — paren_depth must remain 1
        let peeked2 = lexer.peek_token().ok_or("peek at 1 failed")?;
        assert!(matches!(peeked2.token_type, TokenType::Number(_)));
        assert_eq!(lexer.paren_depth, 1, "peek at number must not change paren_depth");

        Ok(())
    }

    #[test]
    fn test_comment_skipping_with_cr_line_endings() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 1;# comment\rmy $y = 2;");
        let mut saw_second_my = false;

        while let Some(token) = lexer.next_token() {
            if matches!(token.token_type, TokenType::EOF) {
                break;
            }

            if matches!(token.token_type, TokenType::Keyword(ref kw) if kw.as_ref() == "my")
                && token.start > 0
            {
                saw_second_my = true;
            }
        }

        assert!(saw_second_my, "lexer should continue after CR-terminated comment line");
        Ok(())
    }

    #[test]
    fn test_pod_skipped_with_cr_only_line_endings() -> TestResult {
        // CR-only line endings (classic Mac): =pod and =cut must be detected
        // when preceded by \r instead of \n.
        let input = "my $before = 1;\r=pod\rThis is documentation.\r=cut\rmy $after = 2;";
        let mut lexer = PerlLexer::new(input);
        let mut token_texts: Vec<String> = Vec::new();

        while let Some(token) = lexer.next_token() {
            if matches!(token.token_type, TokenType::EOF) {
                break;
            }
            if matches!(token.token_type, TokenType::Keyword(_) | TokenType::Identifier(_)) {
                token_texts.push(token.text.to_string());
            }
        }

        // A second `my` must appear after the POD block, i.e. at least two `my`
        // keywords in total. (Fix: the previous `any(..)` formulation evaluated
        // the same global "is there a second my" condition for every element,
        // which reduced to this count check in a confusing way.)
        assert!(
            token_texts.iter().filter(|t| t.as_str() == "my").count() >= 2,
            "lexer should produce tokens after CR-terminated =cut; got: {:?}",
            token_texts
        );

        // Ensure POD body text is not present as an identifier token
        assert!(
            !token_texts.iter().any(|t| t == "documentation"),
            "POD body should be consumed, not emitted as a token; got: {:?}",
            token_texts
        );
        Ok(())
    }

    #[test]
    fn test_exponent_sign_no_digits_plus() -> TestResult {
        // .5e+x — 'e' is not a valid exponent (no digits follow), so the number
        // token must be ".5" only. The 'e' becomes a separate identifier token.
        // Regression: old code produced Number(".5e") by backtracking to the sign
        // character instead of to the 'e' itself.
        let mut lexer = PerlLexer::new(".5e+x");
        let tok1 = lexer.next_token().ok_or("expected first token")?;
        assert!(
            matches!(&tok1.token_type, TokenType::Number(n) if n.as_ref() == ".5"),
            "expected Number(\".5\") but got {:?}",
            tok1.token_type
        );
        // The 'e' must NOT be swallowed into the number token.
        let tok2 = lexer.next_token().ok_or("expected second token")?;
        assert!(
            !matches!(&tok2.token_type, TokenType::Number(_)),
            "number token must not include 'e'; second token should not be a Number, got {:?}",
            tok2.token_type
        );
        Ok(())
    }

    #[test]
    fn test_exponent_sign_no_digits_minus() -> TestResult {
        // 1.5e-y — 'e' is not a valid exponent (no digits follow), so the number
        // token must be "1.5" only. The 'e' becomes a separate identifier token.
        // Regression: old code produced Number("1.5e") by backtracking to the '-'
        // character instead of to the 'e' itself.
        let mut lexer = PerlLexer::new("1.5e-y");
        let tok1 = lexer.next_token().ok_or("expected first token")?;
        assert!(
            matches!(&tok1.token_type, TokenType::Number(n) if n.as_ref() == "1.5"),
            "expected Number(\"1.5\") but got {:?}",
            tok1.token_type
        );
        // The 'e' must NOT be swallowed into the number token.
        let tok2 = lexer.next_token().ok_or("expected second token")?;
        assert!(
            !matches!(&tok2.token_type, TokenType::Number(_)),
            "number token must not include 'e'; second token should not be a Number, got {:?}",
            tok2.token_type
        );
        Ok(())
    }
}
3785}