perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![allow(
117 // Core allows for lexer code
118 clippy::too_many_lines,
119 clippy::module_name_repetitions,
120 clippy::cast_possible_truncation,
121 clippy::cast_sign_loss,
122 clippy::cast_possible_wrap,
123 clippy::cast_precision_loss,
124 clippy::must_use_candidate,
125 clippy::missing_errors_doc,
126 clippy::missing_panics_doc,
127
128 // Lexer-specific patterns that are fine
129 clippy::match_same_arms,
130 clippy::redundant_else,
131 clippy::unnecessary_wraps,
132 clippy::unused_self,
133 clippy::items_after_statements,
134 clippy::struct_excessive_bools,
135 clippy::uninlined_format_args
136)]
137
138use std::sync::{Arc, OnceLock};
139
140pub mod api;
141pub mod builtins;
142pub mod checkpoint;
143pub mod config;
144pub mod error;
145mod heredoc;
146pub mod keywords;
147mod lexer;
148pub mod limits;
149pub mod mode;
150mod quote_handler;
151pub mod token;
152pub mod tokenizer;
153mod unicode;
154
155pub use api::*;
156pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
157pub use config::LexerConfig;
158pub use error::{LexerError, Result};
159pub use lexer::PerlLexer;
160pub use limits::MAX_REGEX_PARSE_STEPS;
161pub use mode::LexerMode;
162pub use perl_position_tracking::Position;
163pub use token::{StringPart, Token, TokenType};
164
165use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
166
167use crate::heredoc::HeredocSpec;
168use crate::limits::{
169 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
170};
171
172impl<'a> PerlLexer<'a> {
173 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
174 pub fn with_body_tokens(input: &'a str) -> Self {
175 let mut lexer = Self::new(input);
176 lexer.emit_heredoc_body_tokens = true;
177 lexer
178 }
179
    /// Set the lexer mode (for resetting state at statement boundaries)
    ///
    /// Intended for external drivers (e.g. a parser) that know more about the
    /// grammatical context than the lexer does and want to force the next
    /// token to be read as a term or operator.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
184
    /// Advance the lexer and return the next token.
    ///
    /// Returns `None` only after an `EOF` token has already been emitted.
    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs: whenever a heredoc
        // body (or aborted quote-operator state) is consumed without producing
        // a token, we `continue` back here instead of calling ourselves.
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow
                // (body_start == 0 is the sentinel for "body not started yet")
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // `<<~LABEL` form: allow any leading spaces/tabs before the label
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any);
                            // stacked heredocs on one source line are read FIFO.
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            // Currently always returns Some(()); the `?` keeps the call uniform.
            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first.
            // NOTE: dispatch order matters — each try_* either consumes and
            // returns a token or leaves `position` untouched for the next one.
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_vstring() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Unicode path without intermediate heap allocation
                let mut buf = [0_u8; 4];
                Arc::from(ch.encode_utf8(&mut buf))
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
449
450 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
451 ///
452 /// **Purpose**: Protect against pathological input that could cause:
453 /// - Infinite loops in regex/heredoc parsing
454 /// - Excessive memory consumption
455 /// - LSP server hangs
456 ///
457 /// **Limits**:
458 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
459 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
460 ///
461 /// **Graceful Degradation**:
462 /// - Budget exceeded → emit `UnknownRest` token
463 /// - Jump to EOF to prevent further parsing of problematic region
464 /// - LSP client can emit soft diagnostic about truncation
465 /// - All previously parsed symbols remain valid
466 ///
467 /// **Performance**:
468 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
469 /// - Slow path: Only triggered on pathological input
470 /// - Amortized cost: O(1) per token
471 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
472 #[inline(always)]
473 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
474 // Fast path: most calls won't hit limits
475 let bytes_consumed = self.position - start;
476 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
477 return None;
478 }
479
480 // Slow path: budget exceeded - graceful degradation
481 #[cfg(debug_assertions)]
482 {
483 tracing::debug!(
484 bytes_consumed,
485 depth,
486 position = self.position,
487 "Lexer budget exceeded"
488 );
489 }
490
491 self.position = self.input.len();
492 Some(Token {
493 token_type: TokenType::UnknownRest,
494 text: Arc::from(""),
495 start,
496 end: self.position,
497 })
498 }
499
    /// Peek at the next token without consuming it.
    ///
    /// Saves and restores the full lexer state so the next call to
    /// [`next_token`](Self::next_token) returns the same token.
    ///
    /// NOTE(review): this saves each mutable field by hand rather than going
    /// through the checkpoint machinery; any new piece of lexer state must be
    /// added to BOTH the save and restore lists below or peeking will leak
    /// state into the subsequent `next_token` call.
    pub fn peek_token(&mut self) -> Option<Token> {
        // Snapshot every field that next_token can mutate.
        let saved_pos = self.position;
        let saved_mode = self.mode;
        let saved_delimiter_stack = self.delimiter_stack.clone();
        let saved_prototype = self.in_prototype;
        let saved_depth = self.prototype_depth;
        let saved_after_sub = self.after_sub;
        let saved_after_arrow = self.after_arrow;
        let saved_hash_brace_depth = self.hash_brace_depth;
        let saved_after_var_subscript = self.after_var_subscript;
        let saved_paren_depth = self.paren_depth;
        let saved_current_pos = self.current_pos;
        let saved_after_newline = self.after_newline;
        let saved_pending_heredocs = self.pending_heredocs.clone();
        let saved_line_start_offset = self.line_start_offset;
        let saved_current_quote_op = self.current_quote_op.clone();
        let saved_eof_emitted = self.eof_emitted;
        let saved_start_time = self.start_time;

        let token = self.next_token();

        // Roll everything back so the peek is side-effect free.
        self.position = saved_pos;
        self.mode = saved_mode;
        self.delimiter_stack = saved_delimiter_stack;
        self.in_prototype = saved_prototype;
        self.prototype_depth = saved_depth;
        self.after_sub = saved_after_sub;
        self.after_arrow = saved_after_arrow;
        self.hash_brace_depth = saved_hash_brace_depth;
        self.after_var_subscript = saved_after_var_subscript;
        self.paren_depth = saved_paren_depth;
        self.current_pos = saved_current_pos;
        self.after_newline = saved_after_newline;
        self.pending_heredocs = saved_pending_heredocs;
        self.line_start_offset = saved_line_start_offset;
        self.current_quote_op = saved_current_quote_op;
        self.eof_emitted = saved_eof_emitted;
        self.start_time = saved_start_time;

        token
    }
545
546 /// Consume all remaining tokens and return them as a vector.
547 ///
548 /// The returned vector always ends with an `EOF` token.
549 pub fn collect_tokens(&mut self) -> Vec<Token> {
550 let mut tokens = Vec::new();
551 while let Some(token) = self.next_token() {
552 if token.token_type == TokenType::EOF {
553 tokens.push(token);
554 break;
555 }
556 tokens.push(token);
557 }
558 tokens
559 }
560
    /// Reset the lexer to the beginning of the input.
    ///
    /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
    /// so the lexer can re-tokenize the same source from scratch.
    pub fn reset(&mut self) {
        // Byte offset and term/operator expectation back to initial values.
        self.position = 0;
        self.mode = LexerMode::ExpectTerm;
        // Context-tracking state used for disambiguation.
        self.delimiter_stack.clear();
        self.in_prototype = false;
        self.prototype_depth = 0;
        self.after_sub = false;
        self.after_arrow = false;
        self.hash_brace_depth = 0;
        self.after_var_subscript = false;
        self.paren_depth = 0;
        // Line/position bookkeeping: we begin at a (virtual) line start.
        self.current_pos = Position::start();
        self.after_newline = true;
        self.pending_heredocs.clear();
        self.line_start_offset = 0;
        self.current_quote_op = None;
        self.eof_emitted = false;
        // Restart the clock used for heredoc timeout protection.
        self.start_time = std::time::Instant::now();
    }
584
    /// Switch the lexer into format-body parsing mode.
    ///
    /// In this mode the lexer consumes input verbatim until it encounters a
    /// line containing only `.` (the Perl format terminator).
    /// `next_token` checks for `LexerMode::InFormatBody` before any other
    /// processing and delegates to `parse_format_body`.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
592
593 // Internal helper methods
594
595 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
596 #[inline(always)]
597 fn byte_at(bytes: &[u8], index: usize) -> u8 {
598 debug_assert!(index < bytes.len());
599 match bytes.get(index) {
600 Some(&byte) => byte,
601 None => 0,
602 }
603 }
604
605 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
606 #[inline(always)]
607 fn current_char(&self) -> Option<char> {
608 if self.position < self.input_bytes.len() {
609 // For ASCII, direct access is safe
610 let byte = Self::byte_at(self.input_bytes, self.position);
611 if byte < 128 {
612 Some(byte as char)
613 } else {
614 // For non-ASCII, fall back to proper UTF-8 parsing
615 self.input.get(self.position..).and_then(|s| s.chars().next())
616 }
617 } else {
618 None
619 }
620 }
621
622 #[inline(always)]
623 fn peek_char(&self, offset: usize) -> Option<char> {
624 if offset > self.config.max_lookahead {
625 return None;
626 }
627
628 let pos = self.position.checked_add(offset)?;
629 if pos < self.input_bytes.len() {
630 // For ASCII, direct access is safe
631 let byte = Self::byte_at(self.input_bytes, pos);
632 if byte < 128 {
633 Some(byte as char)
634 } else {
635 // For non-ASCII, use chars iterator
636 self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
637 }
638 } else {
639 None
640 }
641 }
642
643 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
644 #[inline(always)]
645 fn advance(&mut self) {
646 if self.position < self.input_bytes.len() {
647 let byte = Self::byte_at(self.input_bytes, self.position);
648 if byte < 128 {
649 // ASCII fast path
650 self.position += 1;
651 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
652 {
653 self.position += ch.len_utf8();
654 }
655 }
656 }
657
    /// General-purpose balanced-segment consumer (no quote-boundary recovery).
    ///
    /// On entry the current char must equal `open`; consumes through the
    /// matching `close` (tracking nesting depth and skipping `\`-escaped
    /// characters) and returns the byte position just past the closing
    /// delimiter, or `None` if the segment never closes before EOF. On the
    /// `None` path the consumed input is NOT rewound.
    ///
    /// NOTE(review): the `open` arm is matched before the `close` arm, so
    /// calling this with `open == close` can only ever increment `depth` —
    /// confirm callers pass distinct bracket pairs only.
    ///
    /// For use inside double-quoted string interpolation where the outer `"` must
    /// act as a recovery boundary, use [`consume_balanced_segment_in_string`] instead.
    #[allow(dead_code)]
    #[inline]
    fn consume_balanced_segment(&mut self, open: char, close: char) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // EOF reached before the segment closed.
        None
    }
696
    /// Balanced-segment consumer for use inside a quoted string.
    ///
    /// Like `consume_balanced_segment`, but treats an (unescaped) `terminator`
    /// character — the string's closing quote — as a recovery boundary:
    /// encountering it aborts with `None` so the outer string parser can still
    /// terminate its token cleanly. An escaped terminator (`\"` etc.) is
    /// consumed by the `'\\'` arm and does not abort.
    ///
    /// NOTE(review): on the `None` paths (terminator hit or EOF) the input
    /// consumed so far is not rewound; callers appear to rely on the outer
    /// string scanner re-finding the quote — confirm.
    #[inline]
    fn consume_balanced_segment_in_string(
        &mut self,
        open: char,
        close: char,
        terminator: char,
    ) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == terminator => {
                    // Local recovery for interpolation tails in quoted strings:
                    // stop at the closing quote so the outer string parser can
                    // still terminate this token cleanly.
                    return None;
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // EOF reached before the segment closed.
        None
    }
741
742 /// Fast byte-level check for ASCII characters
743 #[inline]
744 fn peek_byte(&self, offset: usize) -> Option<u8> {
745 if offset > self.config.max_lookahead {
746 return None;
747 }
748
749 let pos = self.position.checked_add(offset)?;
750 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
751 }
752
753 /// Check if the next bytes match a pattern (ASCII only)
754 #[inline]
755 fn matches_bytes(&self, pattern: &[u8]) -> bool {
756 let Some(end_offset) = pattern.len().checked_sub(1) else {
757 return true;
758 };
759
760 if end_offset > self.config.max_lookahead {
761 return false;
762 }
763
764 let Some(end) = self.position.checked_add(pattern.len()) else {
765 return false;
766 };
767
768 if end <= self.input_bytes.len() {
769 &self.input_bytes[self.position..end] == pattern
770 } else {
771 false
772 }
773 }
774
    /// Skip whitespace, `#` line comments, and POD blocks before the next token.
    ///
    /// Side effects: maintains `after_newline`, assigns `body_start` to the
    /// first pending heredoc when a newline is crossed, and leaves `position`
    /// at the first significant byte. Always returns `Some(())` in the current
    /// implementation; the `Option` return lets callers propagate with `?`.
    #[inline]
    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
        // Don't reset after_newline if we're at the start of a line
        if self.position > 0 && self.position != self.line_start_offset {
            self.after_newline = false;
        }

        while self.position < self.input_bytes.len() {
            let byte = Self::byte_at(self.input_bytes, self.position);
            match byte {
                // Fast path for ASCII whitespace - batch process
                b' ' => {
                    // Batch skip spaces for better cache efficiency
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b' '
                    {
                        self.position += 1;
                    }
                    // Continue outer loop if we processed any spaces
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\t' | 0x0B | 0x0C => {
                    // Batch skip horizontal tab, vertical tab, and form feed.
                    // Perl treats these as whitespace separators.
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && matches!(
                            Self::byte_at(self.input_bytes, self.position),
                            b'\t' | 0x0B | 0x0C
                        )
                    {
                        self.position += 1;
                    }
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\r' | b'\n' => {
                    self.consume_newline();

                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                    // Only check if we have pending heredocs to avoid unnecessary work
                    if !self.pending_heredocs.is_empty() {
                        for spec in &mut self.pending_heredocs {
                            if spec.body_start == 0 {
                                spec.body_start = self.position;
                                break; // Only set for the first unresolved heredoc
                            }
                        }
                    }
                }
                b'#' => {
                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
                        break;
                    }

                    // Skip line comment using memchr for fast newline search
                    self.position += 1; // Skip # directly

                    // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
                    if let Some(newline_offset) =
                        memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
                    {
                        self.position += newline_offset;
                    } else {
                        // No newline found, skip to end
                        self.position = self.input_bytes.len();
                    }
                }
                // POD directives are only recognized at the start of a line.
                b'=' if self.position == 0
                    || (self.position > 0
                        && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
                {
                    // Check if this starts a POD section (=pod, =head, =over, etc.)
                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                    let remaining = &self.input_bytes[self.position..];
                    if remaining.starts_with(b"=pod")
                        || remaining.starts_with(b"=head")
                        || remaining.starts_with(b"=over")
                        || remaining.starts_with(b"=item")
                        || remaining.starts_with(b"=back")
                        || remaining.starts_with(b"=begin")
                        || remaining.starts_with(b"=end")
                        || remaining.starts_with(b"=for")
                        || remaining.starts_with(b"=encoding")
                    {
                        // Scan forward for \n=cut (end of POD block)
                        let search_start = self.position;
                        let mut found_cut = false;
                        let bytes = self.input_bytes;
                        let mut i = search_start;
                        while i < bytes.len() {
                            // Look for =cut at the start of a line
                            if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
                                && bytes[i..].starts_with(b"=cut")
                            {
                                i += 4; // Skip "=cut"
                                // Skip rest of the =cut line
                                while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
                                    i += 1;
                                }
                                // Consume one line ending sequence if present
                                if i < bytes.len() && bytes[i] == b'\r' {
                                    i += 1;
                                    if i < bytes.len() && bytes[i] == b'\n' {
                                        i += 1;
                                    }
                                } else if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                                self.position = i;
                                found_cut = true;
                                break;
                            }
                            i += 1;
                        }
                        if !found_cut {
                            // POD extends to end of file
                            self.position = bytes.len();
                        }
                        continue;
                    }
                    // Not a POD directive - regular '=' token
                    break;
                }
                _ => {
                    // For non-ASCII whitespace, use char check only when needed
                    if byte >= 128
                        && let Some(ch) = self.current_char()
                        && ch.is_whitespace()
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }
        }
        Some(())
    }
919
    /// Try to lex a heredoc introducer: `<<EOF`, `<<~EOF`, `<<"EOF"`,
    /// `<<'EOF'`, `` <<`EOF` ``, or `<<\EOF`.
    ///
    /// On success, queues a [`HeredocSpec`] on `pending_heredocs` (the body is
    /// consumed later, when `next_token` crosses the newline that follows this
    /// line) and returns a `HeredocStart` token. Returns `None` — with
    /// `self.position` restored — when `<<` should instead lex as the
    /// left-shift operator.
    fn try_heredoc(&mut self) -> Option<Token> {
        // `<<` is the left-shift operator, not a heredoc, when we are inside
        // a parenthesized expression and have just finished a term.
        // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
        // so `<<index` must be the bitshift operator, not a heredoc start.
        //
        // We must NOT fire the guard at statement level (paren_depth == 0) because
        // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
        // is a heredoc. The depth check distinguishes the two cases.
        if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
            return None;
        }

        // Check for heredoc start
        if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
            return None;
        }

        let start = self.position;
        let mut text = String::from("<<");
        self.position += 2; // Skip <<

        // Check for indented heredoc (~)
        let allow_indent = if self.current_char() == Some('~') {
            text.push('~');
            self.advance();
            true
        } else {
            false
        };

        // Skip whitespace between `<<` and the label (kept in `text` verbatim)
        while let Some(ch) = self.current_char() {
            if ch == ' ' || ch == '\t' {
                text.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        // Optional backslash disables interpolation, treat like single-quoted label
        let backslashed = if self.current_char() == Some('\\') {
            text.push('\\');
            self.advance();
            true
        } else {
            false
        };

        // Parse delimiter (quoted or bare-word)
        let delimiter = if self.position < self.input.len() {
            match self.current_char() {
                Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
                Some('\'') if !backslashed => {
                    self.parse_quoted_heredoc_delimiter('\'', &mut text)?
                }
                Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
                Some(c) if is_perl_identifier_start(c) => {
                    // Bare word delimiter
                    let mut delim = String::new();
                    while self.position < self.input.len() {
                        if let Some(c) = self.current_char() {
                            if is_perl_identifier_continue(c) {
                                delim.push(c);
                                text.push(c);
                                self.advance();
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    delim
                }
                _ => {
                    // Not a valid heredoc delimiter - reset position and return None
                    // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
                    self.position = start;
                    return None;
                }
            }
        } else {
            // No delimiter found - reset position and return None
            self.position = start;
            return None;
        };

        // For now, return a placeholder token
        // The actual heredoc body would be parsed later when we encounter it
        self.mode = LexerMode::ExpectOperator;

        // Recursion depth limit (Issue #443)
        if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
            return Some(Token {
                token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
                text: Arc::from(text),
                start,
                end: self.position,
            });
        }

        // Queue the heredoc spec with its label
        self.pending_heredocs.push(HeredocSpec {
            label: Arc::from(delimiter.as_str()),
            body_start: 0, // Will be set when we see the newline after this line
            allow_indent,
        });

        Some(Token {
            token_type: TokenType::HeredocStart,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1037
1038 fn try_string(&mut self) -> Option<Token> {
1039 let start = self.position;
1040 let quote = self.current_char()?;
1041
1042 match quote {
1043 '"' => self.parse_double_quoted_string(start),
1044 '\'' => self.parse_single_quoted_string(start),
1045 '`' => self.parse_backtick_string(start),
1046 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1047 _ => None,
1048 }
1049 }
1050
    /// Try to lex a numeric literal at the current position.
    ///
    /// Handles, in order:
    /// - `0x`/`0X` hexadecimal, `0b`/`0B` binary, and `0o`/`0O` octal prefixes
    ///   (each requires at least one real digit after the prefix; otherwise the
    ///   leading `0` falls through and is lexed as plain decimal),
    /// - decimal digits with `_` separators,
    /// - an optional fractional part — the `.` is consumed only when followed
    ///   by a digit, end-of-input, or an operator/terminator byte, so e.g.
    ///   `1.foo` keeps the `.` out of the number token,
    /// - an optional `e`/`E` exponent with optional sign; if no digit follows
    ///   the marker the lexer backtracks so the `e` lexes as a separate token.
    ///
    /// On success, advances `self.position` past the literal, switches the
    /// lexer to `ExpectOperator` mode, and returns a `Number` token spanning
    /// `start..self.position`. Returns `None` when the current byte is not an
    /// ASCII digit.
    #[inline]
    fn try_number(&mut self) -> Option<Token> {
        let start = self.position;

        // Fast byte check for digits - optimized bounds checking
        let bytes = self.input_bytes;
        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
            return None;
        }

        // Check for hex (0x), binary (0b), or octal (0o) prefixes
        let mut pos = self.position;
        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
            let prefix_byte = bytes[pos + 1];
            if prefix_byte == b'x' || prefix_byte == b'X' {
                // Hexadecimal: 0x[0-9a-fA-F_]+
                pos += 2; // consume '0x'
                let digit_start = pos;
                let mut saw_digit = false;
                // `saw_digit` distinguishes `0x1_2` (valid) from `0x__` (no digits).
                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
                    saw_digit |= bytes[pos].is_ascii_hexdigit();
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No hex digits after 0x - fall through to parse '0' as decimal
            } else if prefix_byte == b'b' || prefix_byte == b'B' {
                // Binary: 0b[01_]+
                pos += 2; // consume '0b'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
                {
                    saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No binary digits after 0b - fall through to parse '0' as decimal
            } else if prefix_byte == b'o' || prefix_byte == b'O' {
                // Octal (explicit): 0o[0-7_]+
                pos += 2; // consume '0o'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
                {
                    saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No octal digits after 0o - fall through to parse '0' as decimal
            }
        }

        // Consume initial digits - unrolled for better performance
        // (pos is re-synced to self.position here: a failed radix prefix above
        // leaves self.position at `start`, so the leading '0' is re-scanned.)
        pos = self.position;
        while pos < bytes.len() {
            let byte = Self::byte_at(bytes, pos);
            if byte.is_ascii_digit() || byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        self.position = pos;

        // Check for decimal point - optimized with single bounds check
        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
            // Peek ahead to see what follows the dot
            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();

            // Optimized dot consumption logic
            // Consume the dot when a digit follows, at end of input (`1.`),
            // or when an operator/terminator byte follows (`1. + 2`). This
            // deliberately includes 'e'/'E' so `1.e5` parses as one number.
            let should_consume_dot = has_following_digit || {
                pos + 1 >= bytes.len() || {
                    // Use bitwise operations for faster character classification
                    let next_byte = bytes[pos + 1];
                    // Whitespace, delimiters, operators - optimized check
                    // (next_byte <= b' ' covers all ASCII control chars + space)
                    next_byte <= b' '
                        || matches!(
                            next_byte,
                            b';' | b','
                                | b')'
                                | b'}'
                                | b']'
                                | b'+'
                                | b'-'
                                | b'*'
                                | b'/'
                                | b'%'
                                | b'='
                                | b'<'
                                | b'>'
                                | b'!'
                                | b'&'
                                | b'|'
                                | b'^'
                                | b'~'
                                | b'e'
                                | b'E'
                        )
                }
            };

            if should_consume_dot {
                pos += 1; // consume the dot
                // Consume fractional digits - batch processing
                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                self.position = pos;
            }
        }

        // Check for exponent - optimized
        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
            let exp_start = pos;
            pos += 1; // consume 'e' or 'E'

            // Check for optional sign
            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                pos += 1;
            }

            // Must have at least one digit after exponent (underscores allowed between digits)
            let mut saw_digit = false;
            while pos < bytes.len() {
                let byte = bytes[pos];
                if byte.is_ascii_digit() {
                    saw_digit = true;
                    pos += 1;
                } else if byte == b'_' {
                    pos += 1;
                } else {
                    break;
                }
            }

            // If no digits after exponent, backtrack
            if !saw_digit {
                pos = exp_start;
            }

            self.position = pos;
        }

        // Avoid string slicing for common number cases - use Arc::from directly on slice
        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Number(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1237
1238 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1239 // We're at the dot, consume it
1240 self.advance();
1241
1242 // Parse the fractional part
1243 while self.position < self.input_bytes.len() {
1244 let byte = self.input_bytes[self.position];
1245 match byte {
1246 b'0'..=b'9' | b'_' => self.position += 1,
1247 b'e' | b'E' => {
1248 // Handle scientific notation.
1249 // Save the position of 'e'/'E' so we can backtrack here if
1250 // no digits follow the exponent marker (with or without sign).
1251 let e_pos = self.position;
1252 self.advance();
1253 if self.position < self.input_bytes.len() {
1254 let next = self.input_bytes[self.position];
1255 if next == b'+' || next == b'-' {
1256 self.advance();
1257 }
1258 }
1259 // Parse exponent digits (underscores allowed between digits)
1260 let exponent_start = self.position;
1261 let mut saw_digit = false;
1262 while self.position < self.input_bytes.len() {
1263 let byte = self.input_bytes[self.position];
1264 if byte.is_ascii_digit() {
1265 saw_digit = true;
1266 self.position += 1;
1267 } else if byte == b'_' {
1268 self.position += 1;
1269 } else {
1270 break;
1271 }
1272 }
1273
1274 // No digits after exponent marker — backtrack to just before
1275 // 'e'/'E' so the caller sees it as a separate token.
1276 // Using e_pos (not exponent_start-1) avoids including 'e' in
1277 // the number slice when a sign character was consumed.
1278 if !saw_digit {
1279 let _ = exponent_start; // mark as intentionally unused
1280 self.position = e_pos;
1281 }
1282 break;
1283 }
1284 _ => break,
1285 }
1286 }
1287
1288 let text = &self.input[start..self.position];
1289 self.mode = LexerMode::ExpectOperator;
1290
1291 Some(Token {
1292 token_type: TokenType::Number(Arc::from(text)),
1293 text: Arc::from(text),
1294 start,
1295 end: self.position,
1296 })
1297 }
1298
    /// Try to lex a variable token beginning at a sigil (`$`, `@`, `%`, `*`).
    ///
    /// Covers plain variables (`$foo`, `@bar`, `%baz`), package-qualified
    /// names (`$Foo::bar`), the array-length form `$#array`, braced names
    /// (`${foo}`, `${^MATCH}`, stash access `$::{foo}`), globs (`*name`,
    /// `*{...}`), punctuation variables (`$!`, `$$`, `$^W`, `@+`, …), and the
    /// sigil-only tokens emitted for dereference syntax (`@{$ref}`) and
    /// postfix dereference (`->@*`).
    ///
    /// Side effects on success: advances `self.position` past the consumed
    /// text, sets `self.mode = ExpectOperator`, and updates
    /// `self.after_var_subscript` so a following `{` can be classified as a
    /// hash subscript rather than a block opener. Returns `None` when the
    /// sigil should instead be treated as an operator (`%`/`*` while in
    /// ExpectOperator mode) or the character is not a sigil at all.
    fn try_variable(&mut self) -> Option<Token> {
        let start = self.position;
        let sigil = self.current_char()?;

        match sigil {
            '$' | '@' | '%' | '*' => {
                // In ExpectOperator mode, treat % and * as operators rather than sigils
                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                    return None;
                }
                self.advance();

                // Special case: After ->, sigils followed by { or [ should be tokenized separately
                // This is for postfix dereference like ->@*, ->%{}, ->@[]
                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
                let check_arrow = self.position >= 3
                    && self.position.saturating_sub(1) <= self.input.len()
                    && self.input.is_char_boundary(self.position.saturating_sub(3))
                    && self.input.is_char_boundary(self.position.saturating_sub(1));

                if check_arrow
                    && {
                        // Temporarily rewind 3 bytes (2 for "->", 1 for the
                        // sigil we just consumed) to test for a preceding arrow.
                        let saved = self.position;
                        self.position -= 3;
                        let arrow = self.matches_bytes(b"->");
                        self.position = saved;
                        arrow
                    }
                    && matches!(self.current_char(), Some('{' | '[' | '*'))
                {
                    // Just return the sigil
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for $# (array length operator)
                if sigil == '$' && self.current_char() == Some('#') {
                    self.advance(); // consume #
                    // Now parse the array name
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else if ch == ':' && self.peek_char(1) == Some(':') {
                            // Package-qualified array name
                            self.advance();
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // $#foo is a complete variable token; a following `{` is a subscript.
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
                if self.current_char() == Some('{') {
                    // Peek ahead to decide if we should consume the brace
                    let next_char = self.peek_char(1);

                    // Check if this is a dereference like @{$ref} or @{[...]}
                    // If the next char suggests dereference, don't consume the brace.
                    // For @ and % sigils, identifiers inside braces are also derefs
                    // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                    let is_deref = sigil != '*'
                        && (matches!(
                            next_char,
                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                        ) || (matches!(sigil, '@' | '%')
                            && next_char.is_some_and(is_perl_identifier_start)));
                    if is_deref {
                        // This is a dereference, don't consume the brace
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // A standalone sigil token before `{` starts a dereference
                        // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
                        // Mark it as subscript-capable so `{` increments brace depth
                        // and the closing `}` can enable chained `{...}` subscripts.
                        // (Broader form than master's `$|@|%` filter — `*` is already
                        // excluded by the `is_deref` guard above and `&` deref also
                        // benefits from chained-subscript handling.)
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    self.advance(); // consume {

                    // Handle special variables with caret
                    if self.current_char() == Some('^') {
                        self.advance(); // consume ^
                        // Parse the special variable name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Handle stash access like $::{foo}
                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance(); // consume first :
                        self.advance(); // consume second :
                        // Skip optional { and }
                        if self.current_char() == Some('{') {
                            self.advance();
                        }
                        // Parse the name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                if self.current_char() == Some('}') {
                                    self.advance(); // consume closing } of ${...}
                                }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Regular braced variable like ${foo} or glob like *{$glob}
                    else {
                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                        // EXCEPT for globs - *{$glob} should be parsed as one token
                        // Also check for empty braces or EOF - in these cases we should split the tokens
                        if sigil != '*'
                            && !self.current_char().is_some_and(is_perl_identifier_start)
                        {
                            // This is a dereference or empty/invalid brace, backtrack
                            // NOTE: `start + 1` is safe here because all sigils are
                            // single-byte ASCII characters.
                            self.position = start + 1; // Just past the sigil
                            let text = &self.input[start..self.position];
                            self.mode = LexerMode::ExpectOperator;
                            // Same as above: sigil-only token means a dereference opener.
                            self.after_var_subscript = true;

                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                text: Arc::from(text),
                                start,
                                end: self.position,
                            });
                        }

                        // For glob access, we need to consume everything inside braces
                        if sigil == '*' {
                            let mut brace_depth: usize = 1;
                            while let Some(ch) = self.current_char() {
                                if ch == '{' {
                                    brace_depth += 1;
                                } else if ch == '}' {
                                    brace_depth = brace_depth.saturating_sub(1);
                                    if brace_depth == 0 {
                                        self.advance(); // consume final }
                                        break;
                                    }
                                }
                                self.advance();
                            }
                        } else {
                            // Regular variable
                            while let Some(ch) = self.current_char() {
                                if ch == '}' {
                                    self.advance(); // consume }
                                    break;
                                } else if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                }
                // Parse regular variable name
                else if let Some(ch) = self.current_char() {
                    if is_perl_identifier_start(ch) {
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                        // Handle package-qualified segments like Foo::bar
                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                            self.advance();
                            self.advance();
                            while let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                    // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                    // Not inside prototypes where ^ is a literal prototype char
                    else if sigil == '$' && ch == '^' && !self.in_prototype {
                        self.advance(); // consume ^
                        // $^Letter: consume the single uppercase letter
                        if let Some(letter) = self.current_char()
                            && letter.is_ascii_uppercase()
                        {
                            self.advance();
                        }
                        // bare $^ (no uppercase letter follows): format_top_name — stop here
                    }
                    // Handle special punctuation variables
                    // Not inside prototypes where ; and , are literal prototype chars
                    else if sigil == '$'
                        && !self.in_prototype
                        && matches!(
                            ch,
                            '?' | '!'
                                | '@'
                                | '&'
                                | '`'
                                | '\''
                                | '.'
                                | '/'
                                | '\\'
                                | '|'
                                | '+'
                                | '-'
                                | '['
                                | ']'
                                | '$'
                                | '~'
                                | '='
                                | '%'
                                | ','
                                | '"'
                                | ';'
                                | '>'
                                | '<'
                                | ')'
                                | '(' // $( = real group ID of this process
                        )
                    {
                        self.advance(); // consume the special character
                    }
                    // $$ is the PID special variable, but only when it is not immediately
                    // followed by an identifier-start character. $$var is scalar dereference
                    // of $var, so keep the second $ for the next token.
                    else if sigil == '$' && ch == '$' {
                        if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                            self.advance(); // consume the second $ for bare $$ PID
                        }
                    }
                    // Handle special array/hash punctuation variables
                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                        self.advance(); // consume the + or -
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // A complete $foo, @foo, %foo token can be followed by a hash/slice
                // subscript `{`. Set the flag so the `{` handler knows to increment
                // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
                // hash subscripts in the same way.
                self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

                Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                })
            }
            _ => None,
        }
    }
1601
1602 /// Return the next non-space char and the char immediately following it (without consuming).
1603 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1604 /// from `=` used as a plain delimiter.
1605 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1606 let mut i = self.position;
1607 while i < self.input.len() {
1608 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1609 Some(c) => c,
1610 None => return (None, None),
1611 };
1612 if c.is_whitespace() {
1613 i += c.len_utf8();
1614 continue;
1615 }
1616 // Found non-space at position i; peek the next char after it
1617 let j = i + c.len_utf8();
1618 let following = self.input.get(j..).and_then(|s| s.chars().next());
1619 return (Some(c), following);
1620 }
1621 (None, None)
1622 }
1623
1624 /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1625 fn is_quote_delim(c: char) -> bool {
1626 // Perl allows any non-alphanumeric, non-whitespace character as delimiter,
1627 // including control characters (e.g. s\x07pattern\x07replacement\x07).
1628 !c.is_ascii_alphanumeric() && !c.is_whitespace()
1629 }
1630
1631 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1632 ///
1633 /// A v-string starts with `v` followed by one or more digits, then optionally
1634 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1635 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1636 #[inline]
1637 fn try_vstring(&mut self) -> Option<Token> {
1638 let start = self.position;
1639 let bytes = self.input_bytes;
1640
1641 // Must start with 'v' followed by at least one digit
1642 if start >= bytes.len() || bytes[start] != b'v' {
1643 return None;
1644 }
1645
1646 let next_pos = start + 1;
1647 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1648 return None;
1649 }
1650
1651 // We have `v` followed by a digit — scan the rest of the v-string.
1652 // Pattern: v DIGITS (.DIGITS)*
1653 let mut pos = next_pos;
1654
1655 // Consume leading digits
1656 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1657 pos += 1;
1658 }
1659
1660 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1661 while pos < bytes.len() && bytes[pos] == b'.' {
1662 let dot_pos = pos;
1663 pos += 1; // skip '.'
1664
1665 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1666 // Dot not followed by digit — not part of the v-string
1667 pos = dot_pos;
1668 break;
1669 }
1670
1671 // Consume digits after the dot
1672 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1673 pos += 1;
1674 }
1675 }
1676
1677 // Make sure the v-string isn't followed by identifier-continuation characters
1678 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1679 if pos < bytes.len() {
1680 let next_byte = bytes[pos];
1681 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1682 return None;
1683 }
1684 // Also check for non-ASCII identifier continuations
1685 if next_byte >= 128
1686 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1687 && is_perl_identifier_continue(ch)
1688 {
1689 return None;
1690 }
1691 }
1692
1693 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1694 let text = &self.input[start..pos];
1695
1696 self.position = pos;
1697 self.mode = LexerMode::ExpectOperator;
1698
1699 Some(Token {
1700 token_type: TokenType::Version(Arc::from(text)),
1701 text: Arc::from(text),
1702 start,
1703 end: self.position,
1704 })
1705 }
1706
1707 #[inline]
1708 fn apostrophe_starts_legacy_package_segment(&self, position: usize) -> bool {
1709 let next_position = position + '\''.len_utf8();
1710 self.input
1711 .get(next_position..)
1712 .and_then(|suffix| suffix.chars().next())
1713 .is_some_and(is_perl_identifier_start)
1714 }
1715
1716 #[inline]
1717 fn try_identifier_or_keyword(&mut self) -> Option<Token> {
1718 let start = self.position;
1719 let ch = self.current_char()?;
1720 let bytes = self.input_bytes;
1721 let len = bytes.len();
1722
1723 if is_perl_identifier_start(ch) {
1724 // Special case: substitution/transliteration with single-quote delimiter
1725 // The single quote is considered an identifier continuation, so we need to
1726 // detect these operators before consuming it as part of an identifier.
1727 if !self.after_arrow
1728 && self.hash_brace_depth == 0
1729 && ch == 's'
1730 && self.peek_char(1) == Some('\'')
1731 {
1732 self.advance(); // consume 's'
1733 return self.parse_substitution(start);
1734 } else if !self.after_arrow
1735 && self.hash_brace_depth == 0
1736 && ch == 'y'
1737 && self.peek_char(1) == Some('\'')
1738 {
1739 self.advance(); // consume 'y'
1740 return self.parse_transliteration(start);
1741 } else if !self.after_arrow
1742 && self.hash_brace_depth == 0
1743 && ch == 't'
1744 && self.peek_char(1) == Some('r')
1745 && self.peek_char(2) == Some('\'')
1746 {
1747 self.advance(); // consume 't'
1748 self.advance(); // consume 'r'
1749 return self.parse_transliteration(start);
1750 }
1751
1752 // Fast ASCII path for identifier continuation.
1753 while self.position < len {
1754 let byte = bytes[self.position];
1755 if byte == b'\'' {
1756 if is_quote_op_word_prefix(&bytes[start..self.position])
1757 || !self.apostrophe_starts_legacy_package_segment(self.position)
1758 {
1759 // Keep apostrophe for quote/string parsing in cases like q'...'
1760 // and split' ', while still accepting Foo'Bar package spelling.
1761 break;
1762 }
1763 self.position += 1;
1764 continue;
1765 }
1766
1767 if byte.is_ascii_alphanumeric() || byte == b'_' {
1768 self.position += 1;
1769 continue;
1770 }
1771
1772 if byte < 128 {
1773 break;
1774 }
1775
1776 if let Some(ch) = self.current_char()
1777 && is_perl_identifier_continue(ch)
1778 {
1779 self.advance();
1780 continue;
1781 }
1782 break;
1783 }
1784 // Handle package-qualified identifiers like Foo::bar.
1785 while self.config.max_lookahead >= 1
1786 && self.position + 1 < len
1787 && bytes[self.position] == b':'
1788 && bytes[self.position + 1] == b':'
1789 {
1790 self.position += 2; // consume '::'
1791
1792 // consume following identifier segment if present
1793 let Some(ch) = self.current_char() else {
1794 break;
1795 };
1796 if !is_perl_identifier_start(ch) {
1797 break;
1798 }
1799 self.advance();
1800 while self.position < len {
1801 let byte = bytes[self.position];
1802 if byte == b'\'' {
1803 if !self.apostrophe_starts_legacy_package_segment(self.position) {
1804 break;
1805 }
1806 self.position += 1;
1807 continue;
1808 }
1809
1810 if byte.is_ascii_alphanumeric() || byte == b'_' {
1811 self.position += 1;
1812 continue;
1813 }
1814 if byte < 128 {
1815 break;
1816 }
1817 if let Some(ch) = self.current_char()
1818 && is_perl_identifier_continue(ch)
1819 {
1820 self.advance();
1821 continue;
1822 }
1823 break;
1824 }
1825 }
1826
1827 let text = &self.input[start..self.position];
1828
1829 // Check for __DATA__ and __END__ markers using exact match
1830 // Only recognize these in code channel, not inside data/format sections or heredocs
1831 let in_code_channel =
1832 !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
1833 && self.pending_heredocs.is_empty();
1834
1835 let marker = if in_code_channel {
1836 if text == "__DATA__" {
1837 Some("__DATA__")
1838 } else if text == "__END__" {
1839 Some("__END__")
1840 } else {
1841 None
1842 }
1843 } else {
1844 None
1845 };
1846
1847 if let Some(marker_text) = marker {
1848 // These must be at the beginning of a line
1849 // Use the after_newline flag to determine if we're at line start
1850 if self.after_newline {
1851 // Check if rest of line is only whitespace
1852 // Only treat as data marker if line has no trailing junk
1853 if Self::trailing_ws_only(self.input_bytes, self.position) {
1854 // Consume the rest of the line (the marker line)
1855 while self.position < self.input.len()
1856 && self.input_bytes[self.position] != b'\n'
1857 && self.input_bytes[self.position] != b'\r'
1858 {
1859 self.advance();
1860 }
1861 self.consume_newline();
1862
1863 // Switch to data section mode
1864 self.mode = LexerMode::InDataSection;
1865
1866 return Some(Token {
1867 token_type: TokenType::DataMarker(Arc::from(marker_text)),
1868 text: Arc::from(marker_text),
1869 start,
1870 end: self.position,
1871 });
1872 }
1873 }
1874 }
1875
1876 // Check for substitution/transliteration operators
1877 // Skip if after '->' -- these are method names, not operators.
1878 #[allow(clippy::collapsible_if)]
1879 if !self.after_arrow && self.hash_brace_depth == 0 && matches!(text, "s" | "tr" | "y") {
1880 let immediate = self.current_char();
1881 let (candidate, char_after_next, has_whitespace) =
1882 if immediate.is_some_and(|c| c.is_whitespace()) {
1883 let (nc, ca) = self.peek_nonspace_and_following();
1884 (nc, ca, true)
1885 } else {
1886 let following = immediate.and_then(|c| {
1887 let j = self.position + c.len_utf8();
1888 self.input.get(j..).and_then(|s| s.chars().next())
1889 });
1890 (immediate, following, false)
1891 };
1892
1893 if let Some(next) = candidate {
1894 // `s => 1` should remain a fat-arrow hash key, not quote op.
1895 let is_fat_arrow = next == '=' && char_after_next == Some('>');
1896 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
1897 let is_quote_char = matches!(next, '\'' | '"') && text != "s";
1898 let transliteration_allows_whitespace = text == "tr" || text == "y";
1899 let substitution_disallows_whitespace = text == "s" && has_whitespace;
1900 let is_valid_delim = Self::is_quote_delim(next)
1901 && !is_fat_arrow
1902 && !substitution_disallows_whitespace
1903 && (!has_whitespace
1904 || is_paired_delim
1905 || is_quote_char
1906 || transliteration_allows_whitespace);
1907
1908 if is_valid_delim {
1909 match text {
1910 "s" => return self.parse_substitution(start),
1911 "tr" | "y" => return self.parse_transliteration(start),
1912 unexpected => {
1913 return Some(Token {
1914 token_type: TokenType::Error(Arc::from(format!(
1915 "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
1916 unexpected, start
1917 ))),
1918 text: Arc::from(unexpected),
1919 start,
1920 end: self.position,
1921 });
1922 }
1923 }
1924 }
1925 }
1926 }
1927
1928 let token_type = if is_keyword_fast(text) {
1929 // Check for special keywords that affect lexer mode
1930 match text {
1931 "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
1932 | "sort" | "split" | "and" | "or" | "xor" | "not"
1933 // These keywords introduce an expression, so a following `/` is a
1934 // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
1935 // `do /file/`, and `eval /re/` are all valid Perl.
1936 | "return" | "die" | "warn" | "do" | "eval" => {
1937 self.mode = LexerMode::ExpectTerm;
1938 }
1939 "sub" => {
1940 self.after_sub = true;
1941 self.mode = LexerMode::ExpectTerm;
1942 }
1943 // Quote operators expect a delimiter next.
1944 // Skip if after '->' -- these are method names, not operators.
1945 // Skip inside hash subscript braces (hash_brace_depth > 0) — all
1946 // positions inside `$h{...}` or `@h{...}` treat quote-op names as
1947 // bareword keys, including after commas in slices like `@h{m, s}`.
1948 op if !self.after_arrow
1949 && self.hash_brace_depth == 0
1950 && quote_handler::is_quote_operator(op) =>
1951 {
1952 // Perl allows whitespace between a quote-like operator and its delimiter,
1953 // but ONLY for paired delimiters (s { ... } { ... }g).
1954 // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
1955 // must be immediately adjacent — otherwise `s $foo` would wrongly
1956 // treat `$` as a delimiter instead of being a bareword `s` followed
1957 // by a scalar variable.
1958 //
1959 // Strategy:
1960 // 1. Check the immediately-adjacent char first (no whitespace skip).
1961 // If it is a valid delimiter → any non-alnum, non-whitespace char.
1962 // 2. If the adjacent char is whitespace, peek past it.
1963 // Only accept PAIRED delimiters ({, [, (, <) in that case.
1964 let immediate = self.current_char();
1965 let (candidate, char_after_next, has_whitespace) =
1966 if immediate.is_some_and(|c| c.is_whitespace()) {
1967 // There is whitespace — peek past it
1968 let (nc, ca) = self.peek_nonspace_and_following();
1969 (nc, ca, true)
1970 } else {
1971 // No whitespace — use immediate char
1972 let following = immediate.and_then(|c| {
1973 let j = self.position + c.len_utf8();
1974 self.input.get(j..).and_then(|s| s.chars().next())
1975 });
1976 (immediate, following, false)
1977 };
1978
1979 if let Some(next) = candidate {
1980 // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
1981 // not a valid substitution delimiter. Treat as identifier.
1982 let is_fat_arrow = next == '=' && char_after_next == Some('>');
1983
1984 // When whitespace precedes the delimiter, only unambiguous
1985 // delimiters are accepted:
1986 // - Paired delimiters ({, [, (, <) are always safe.
1987 // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
1988 // is a valid file-size filetest and must not be treated as a
1989 // substitution start. All other operators (qw, q, qq, qr, qx, m,
1990 // tr, y) have no corresponding file-test operator.
1991 // - / is safe for non-substitution quote operators; `qw /a b/` and
1992 // `m /re/` are common, while `s /foo/bar/` remains ambiguous with
1993 // the file-size test shape and stays rejected here.
1994 // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
1995 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
1996 let is_quote_char = matches!(next, '\'' | '"') && op != "s";
1997 let is_spaced_slash_delim = next == '/' && op != "s";
1998 let is_valid_delim = Self::is_quote_delim(next)
1999 && !is_fat_arrow
2000 && (!has_whitespace
2001 || is_paired_delim
2002 || is_quote_char
2003 || is_spaced_slash_delim);
2004
2005 if is_valid_delim {
2006 self.mode = LexerMode::ExpectDelimiter;
2007 self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
2008 operator: op.to_string(),
2009 delimiter: '\0', // Will be set when we see the delimiter
2010 start_pos: start,
2011 });
2012
2013 // Don't return a keyword token - continue to parse the delimiter
2014 // Skip any whitespace between operator and delimiter
2015 while let Some(ch) = self.current_char() {
2016 if ch.is_whitespace() {
2017 self.advance();
2018 } else {
2019 break;
2020 }
2021 }
2022
2023 // Get the delimiter
2024 #[allow(clippy::collapsible_if)]
2025 if let Some(delim) = self.current_char() {
2026 if !delim.is_alphanumeric() {
2027 self.advance();
2028 if let Some(ref mut info) = self.current_quote_op {
2029 info.delimiter = delim;
2030 }
2031 // Parse the quote operator content and return the complete token
2032 return self.parse_quote_operator(delim);
2033 }
2034 }
2035 } else {
2036 // Not a quote operator here → treat as IDENTIFIER
2037 self.current_quote_op = None;
2038 self.mode = LexerMode::ExpectOperator;
2039 return Some(Token {
2040 token_type: TokenType::Identifier(Arc::from(text)),
2041 start,
2042 end: self.position,
2043 text: Arc::from(text),
2044 });
2045 }
2046 } else {
2047 // End-of-input after the word → also treat as IDENTIFIER
2048 self.current_quote_op = None;
2049 self.mode = LexerMode::ExpectOperator;
2050 return Some(Token {
2051 token_type: TokenType::Identifier(Arc::from(text)),
2052 start,
2053 end: self.position,
2054 text: Arc::from(text),
2055 });
2056 }
2057 // If we get here but haven't returned, something went wrong
2058 // Fall through to treat as identifier
2059 self.current_quote_op = None;
2060 self.mode = LexerMode::ExpectOperator;
2061 return Some(Token {
2062 token_type: TokenType::Identifier(Arc::from(text)),
2063 start,
2064 end: self.position,
2065 text: Arc::from(text),
2066 });
2067 }
2068 // Format declarations need special handling
2069 "format" => {
2070 // We'll need to check for the = after the format name
2071 // For now, just mark that we saw format
2072 }
2073 _ if is_builtin_function(text) => {
2074 // Bare builtins are term-introducing in Perl.
2075 self.mode = LexerMode::ExpectTerm;
2076 }
2077 _ => {
2078 self.mode = LexerMode::ExpectOperator;
2079 }
2080 }
2081 TokenType::Keyword(Arc::from(text))
2082 } else {
2083 // Mirror parser bare-builtin handling so `/` after builtins like
2084 // `join` or `print` is lexed as a regex term, not division.
2085 if is_builtin_function(text) {
2086 self.mode = LexerMode::ExpectTerm;
2087 } else {
2088 self.mode = LexerMode::ExpectOperator;
2089 }
2090 TokenType::Identifier(Arc::from(text))
2091 };
2092
2093 self.after_arrow = false;
2094 // A keyword/identifier is not a variable; `{` after it is a block opener.
2095 self.after_var_subscript = false;
2096 // hash_brace_depth is managed by { and } handlers, not cleared per-token
2097 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2098 } else {
2099 None
2100 }
2101 }
2102
2103 /// Parse data section body - consumes everything to EOF
2104 fn parse_data_body(&mut self) -> Option<Token> {
2105 if self.position >= self.input.len() {
2106 // Already at EOF
2107 self.mode = LexerMode::ExpectTerm;
2108 return Some(Token {
2109 token_type: TokenType::EOF,
2110 text: Arc::from(""),
2111 start: self.position,
2112 end: self.position,
2113 });
2114 }
2115
2116 let start = self.position;
2117 // Consume everything to EOF
2118 let body = &self.input[self.position..];
2119 self.position = self.input.len();
2120
2121 // Reset mode for next parse (though we're at EOF)
2122 self.mode = LexerMode::ExpectTerm;
2123
2124 Some(Token {
2125 token_type: TokenType::DataBody(Arc::from(body)),
2126 text: Arc::from(body),
2127 start,
2128 end: self.position,
2129 })
2130 }
2131
2132 /// Parse format body - consumes until a line with just a dot
2133 fn parse_format_body(&mut self) -> Option<Token> {
2134 let start = self.position;
2135 let mut body = String::new();
2136 let mut line_start = true;
2137
2138 while self.position < self.input.len() {
2139 // Check if we're at the start of a line and the next char is a dot
2140 if line_start && self.current_char() == Some('.') {
2141 // Check if this line contains only a dot
2142 let mut peek_pos = self.position + 1;
2143 let mut found_terminator = true;
2144
2145 // Skip any trailing whitespace on the dot line
2146 while peek_pos < self.input.len() {
2147 match self.input_bytes[peek_pos] {
2148 b' ' | b'\t' | b'\r' => peek_pos += 1,
2149 b'\n' => break,
2150 _ => {
2151 found_terminator = false;
2152 break;
2153 }
2154 }
2155 }
2156
2157 if found_terminator {
2158 // We found the terminating dot, consume it
2159 self.position = peek_pos;
2160 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2161 {
2162 self.position += 1;
2163 }
2164
2165 // Switch back to normal mode
2166 self.mode = LexerMode::ExpectTerm;
2167
2168 return Some(Token {
2169 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2170 text: Arc::from(body),
2171 start,
2172 end: self.position,
2173 });
2174 }
2175 }
2176
2177 // Not a terminator, consume the character
2178 match self.current_char() {
2179 Some(ch) => {
2180 body.push(ch);
2181 self.advance();
2182
2183 // Track if we're at the start of a line
2184 line_start = ch == '\n';
2185 }
2186 None => {
2187 // Reached EOF without finding terminator
2188 break;
2189 }
2190 }
2191 }
2192
2193 // If we reach here, we didn't find a terminator
2194 self.mode = LexerMode::ExpectTerm;
2195 Some(Token {
2196 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2197 text: Arc::from(body),
2198 start,
2199 end: self.position,
2200 })
2201 }
2202
2203 fn try_operator(&mut self) -> Option<Token> {
2204 // Skip operator parsing if we're expecting a delimiter for a quote operator
2205 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2206 return None;
2207 }
2208
2209 let start = self.position;
2210 let ch = self.current_char()?;
2211
2212 // ═══════════════════════════════════════════════════════════════════════
2213 // SLASH DISAMBIGUATION STRATEGY (Issue #422)
2214 // ═══════════════════════════════════════════════════════════════════════
2215 //
2216 // Perl's `/` character is ambiguous:
2217 // - Division operator: `$x / 2`
2218 // - Regex delimiter: `/pattern/`
2219 // - Defined-or operator: `$x // $y`
2220 //
2221 // **Disambiguation Strategy (Context-Aware Heuristics):**
2222 //
2223 // 1. **Mode-Based Decision (Primary)**:
2224 // - `LexerMode::ExpectTerm` → `/` starts a regex
2225 // Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
2226 // - `LexerMode::ExpectOperator` → `/` is division or `//`
2227 // Examples: `$x / 2`, `$x // $y`, `) / 3`
2228 //
2229 // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
2230 // Mode is set based on previous token:
2231 // - After identifier/number/closing paren → ExpectOperator → division
2232 // - After operator/keyword/opening paren → ExpectTerm → regex
2233 //
2234 // 3. **Budget Protection**:
2235 // - Regex parsing has a parse-step budget and byte budget
2236 // - Budget exceeded → emit UnknownRest token (graceful degradation)
2237 // - See `parse_regex()` and `budget_guard()` for implementation
2238 //
2239 // 4. **Performance Characteristics**:
2240 // - Single-pass: O(1) decision based on mode flag
2241 // - No backtracking: Mode updated after each token
2242 // - Optimized: Byte-level operations for common cases
2243 //
2244 // **Metrics & Monitoring**:
2245 // - Budget exceeded events tracked via UnknownRest token emission
2246 // - LSP diagnostics generated for truncated regexes
2247 // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
2248 //
2249 // ═══════════════════════════════════════════════════════════════════════
2250
2251 if ch == '/' {
2252 if self.mode == LexerMode::ExpectTerm {
2253 // Mode indicates we're expecting a term → `/` starts a regex
2254 // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
2255 return self.parse_regex(start);
2256 } else {
2257 // Mode indicates we're expecting an operator → `/` is division or `//`
2258 // Examples: `$x / 2`, `$x // $y`, `10 / 3`
2259 self.advance();
2260 // Check for // or //= using byte-level operations for speed
2261 if self.peek_byte(0) == Some(b'/') {
2262 self.position += 1; // consume second / directly
2263 if self.peek_byte(0) == Some(b'=') {
2264 self.position += 1; // consume = directly
2265 let text = &self.input[start..self.position];
2266 self.mode = LexerMode::ExpectTerm;
2267 return Some(Token {
2268 token_type: TokenType::Operator(Arc::from(text)),
2269 text: Arc::from(text),
2270 start,
2271 end: self.position,
2272 });
2273 } else {
2274 // Use cached string for common "//" operator
2275 self.mode = LexerMode::ExpectTerm;
2276 return Some(Token {
2277 token_type: TokenType::Operator(Arc::from("//")),
2278 text: Arc::from("//"),
2279 start,
2280 end: self.position,
2281 });
2282 }
2283 } else if self.position < self.input_bytes.len()
2284 && self.input_bytes[self.position] == b'='
2285 {
2286 // /= division-assign operator
2287 self.position += 1; // consume =
2288 self.mode = LexerMode::ExpectTerm;
2289 return Some(Token {
2290 token_type: TokenType::Operator(Arc::from("/=")),
2291 text: Arc::from("/="),
2292 start,
2293 end: self.position,
2294 });
2295 } else {
2296 // Use cached string for common "/" division
2297 self.mode = LexerMode::ExpectTerm;
2298 return Some(Token {
2299 token_type: TokenType::Division,
2300 text: Arc::from("/"),
2301 start,
2302 end: self.position,
2303 });
2304 }
2305 }
2306 }
2307
2308 // Handle other operators - simplified
2309 match ch {
2310 '.' => {
2311 // Check if it's a decimal number like .5 -- but only when we
2312 // expect a term. In operator position `.5` is concatenation
2313 // of the bareword/number on the left with the number `5`.
2314 if self.mode != LexerMode::ExpectOperator
2315 && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
2316 {
2317 return self.parse_decimal_number(start);
2318 }
2319 self.advance();
2320 // Check for compound operators
2321 #[allow(clippy::collapsible_if)]
2322 if let Some(next) = self.current_char() {
2323 if is_compound_operator(ch, next) {
2324 self.advance();
2325
2326 // Check for three-character operators like **=, <<=, >>=
2327 if self.position < self.input.len() {
2328 let third = self.current_char();
2329 // Check for three-character operators
2330 if matches!(
2331 (ch, next, third),
2332 ('*', '*', Some('='))
2333 | ('<', '<', Some('='))
2334 | ('>', '>', Some('='))
2335 | ('&', '&', Some('='))
2336 | ('|', '|', Some('='))
2337 | ('/', '/', Some('='))
2338 ) {
2339 self.advance(); // consume the =
2340 } else if ch == '<' && next == '=' && third == Some('>') {
2341 self.advance(); // consume the >
2342 // Special case: <=> spaceship operator
2343 } else if ch == '.' && next == '.' && third == Some('.') {
2344 self.advance(); // consume the third .
2345 }
2346 }
2347 }
2348 }
2349 }
2350 '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
2351 | '\\' => {
2352 self.advance();
2353 // Check for compound operators
2354 #[allow(clippy::collapsible_if)]
2355 if let Some(next) = self.current_char() {
2356 if is_compound_operator(ch, next) {
2357 self.advance();
2358
2359 // Check for three-character operators like **=, <<=, >>=
2360 if self.position < self.input.len() {
2361 let third = self.current_char();
2362 // Check for three-character operators
2363 if matches!(
2364 (ch, next, third),
2365 ('*', '*', Some('='))
2366 | ('<', '<', Some('='))
2367 | ('>', '>', Some('='))
2368 | ('&', '&', Some('='))
2369 | ('|', '|', Some('='))
2370 | ('/', '/', Some('='))
2371 ) {
2372 self.advance(); // consume the =
2373 } else if ch == '<' && next == '=' && third == Some('>') {
2374 self.advance(); // consume the >
2375 // Special case: <=> spaceship operator
2376 }
2377 }
2378 }
2379 }
2380 }
2381 _ => return None,
2382 }
2383
2384 let text = &self.input[start..self.position];
2385 // Operator ends prototype window (e.g. `:` for attributes)
2386 self.after_sub = false;
2387 // Track whether this operator is '->' for method name disambiguation
2388 self.after_arrow = text == "->";
2389 // Any operator token ends the "just saw a variable" window; `{` after
2390 // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
2391 self.after_var_subscript = false;
2392 // Postfix ++ and -- complete a term expression, so next token is an operator
2393 // (e.g., "$x++ / 2" → / is division, not regex)
2394 if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
2395 // Postfix: stay in ExpectOperator
2396 } else {
2397 self.mode = LexerMode::ExpectTerm;
2398 }
2399
2400 Some(Token {
2401 token_type: TokenType::Operator(Arc::from(text)),
2402 text: Arc::from(text),
2403 start,
2404 end: self.position,
2405 })
2406 }
2407
2408 fn try_delimiter(&mut self) -> Option<Token> {
2409 let start = self.position;
2410 let ch = self.current_char()?;
2411
2412 // If we're expecting a delimiter for a quote operator, handle it specially
2413 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2414 // Accept any non-alphanumeric character as a delimiter
2415 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2416 self.advance();
2417 if let Some(ref mut info) = self.current_quote_op {
2418 info.delimiter = ch;
2419 }
2420 // Now parse the quote operator content
2421 return self.parse_quote_operator(ch);
2422 }
2423 }
2424
2425 match ch {
2426 '(' => {
2427 // Check if this is a quote operator delimiter
2428 if matches!(self.mode, LexerMode::ExpectDelimiter)
2429 && self.current_quote_op.is_some()
2430 {
2431 self.advance();
2432 if let Some(ref mut info) = self.current_quote_op {
2433 info.delimiter = ch;
2434 }
2435 return self.parse_quote_operator(ch);
2436 }
2437
2438 self.advance();
2439 if self.after_sub {
2440 // Promote after_sub to in_prototype now that we see '('
2441 self.in_prototype = true;
2442 self.after_sub = false;
2443 self.prototype_depth = 1;
2444 } else if self.in_prototype {
2445 self.prototype_depth += 1;
2446 }
2447 self.paren_depth += 1;
2448 self.after_var_subscript = false;
2449 self.mode = LexerMode::ExpectTerm;
2450 Some(Token {
2451 token_type: TokenType::LeftParen,
2452 text: Arc::from("("),
2453 start,
2454 end: self.position,
2455 })
2456 }
2457 ')' => {
2458 self.advance();
2459 if self.in_prototype && self.prototype_depth > 0 {
2460 self.prototype_depth -= 1;
2461 if self.prototype_depth == 0 {
2462 self.in_prototype = false;
2463 }
2464 }
2465 self.after_arrow = false;
2466 self.paren_depth = self.paren_depth.saturating_sub(1);
2467 // A closing paren ends any var-subscript context: `if ($var)` should
2468 // NOT leave after_var_subscript set, otherwise the following `{` would
2469 // incorrectly increment hash_brace_depth and suppress regex operators
2470 // inside the block body (issue #2844).
2471 self.after_var_subscript = false;
2472 self.mode = LexerMode::ExpectOperator;
2473 Some(Token {
2474 token_type: TokenType::RightParen,
2475 text: Arc::from(")"),
2476 start,
2477 end: self.position,
2478 })
2479 }
2480 ';' => {
2481 self.advance();
2482 // Semicolon ends prototype window (forward declaration)
2483 self.after_sub = false;
2484 // Semicolon is a statement boundary — any pending method-call chain is over.
2485 self.after_arrow = false;
2486 self.after_var_subscript = false;
2487 self.mode = LexerMode::ExpectTerm;
2488 Some(Token {
2489 token_type: TokenType::Semicolon,
2490 text: Arc::from(";"),
2491 start,
2492 end: self.position,
2493 })
2494 }
2495 ',' => {
2496 self.advance();
2497 self.after_var_subscript = false;
2498 self.mode = LexerMode::ExpectTerm;
2499 Some(Token {
2500 token_type: TokenType::Comma,
2501 text: Arc::from(","),
2502 start,
2503 end: self.position,
2504 })
2505 }
2506 '[' => {
2507 self.advance();
2508 self.after_var_subscript = false;
2509 self.mode = LexerMode::ExpectTerm;
2510 Some(Token {
2511 token_type: TokenType::LeftBracket,
2512 text: Arc::from("["),
2513 start,
2514 end: self.position,
2515 })
2516 }
2517 ']' => {
2518 self.advance();
2519 // A closing `]` from an array subscript leaves us in a state where
2520 // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
2521 // Set after_var_subscript so the `{` handler recognises it as such.
2522 // This mirrors the `}` handler's behavior when closing a hash subscript.
2523 self.after_var_subscript = true;
2524 self.mode = LexerMode::ExpectOperator;
2525 Some(Token {
2526 token_type: TokenType::RightBracket,
2527 text: Arc::from("]"),
2528 start,
2529 end: self.position,
2530 })
2531 }
2532 '{' => {
2533 self.advance();
2534 // Opening brace ends prototype window — no prototype follows
2535 self.after_sub = false;
2536 // `{` is a hash/slice subscript opener only when it immediately follows
2537 // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
2538 // This is narrower than the old `mode == ExpectOperator` check, which
2539 // incorrectly incremented depth for block-opening braces after `sub foo`,
2540 // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
2541 // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
2542 if self.after_var_subscript {
2543 self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
2544 }
2545 self.after_var_subscript = false;
2546 self.mode = LexerMode::ExpectTerm;
2547 Some(Token {
2548 token_type: TokenType::LeftBrace,
2549 text: Arc::from("{"),
2550 start,
2551 end: self.position,
2552 })
2553 }
2554 '}' => {
2555 self.advance();
2556 self.after_arrow = false;
2557 // Decrement hash subscript brace depth only if we were inside one.
2558 // If depth > 0, this closes a hash subscript; enable chained subscripts
2559 // like $h{a}{b} by setting after_var_subscript so the next `{` is
2560 // recognized as another subscript opener.
2561 if self.hash_brace_depth > 0 {
2562 self.hash_brace_depth -= 1;
2563 // The subscript value is now the "variable" for a chained subscript.
2564 self.after_var_subscript = true;
2565 } else {
2566 // Block-close `}` — no subscript follows
2567 self.after_var_subscript = false;
2568 }
2569 self.mode = LexerMode::ExpectOperator;
2570 Some(Token {
2571 token_type: TokenType::RightBrace,
2572 text: Arc::from("}"),
2573 start,
2574 end: self.position,
2575 })
2576 }
2577 '#' => {
2578 // Only treat as delimiter in ExpectDelimiter mode
2579 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2580 self.advance();
2581 // Reset mode after consuming delimiter
2582 self.mode = LexerMode::ExpectTerm;
2583 Some(Token {
2584 token_type: TokenType::Operator(Arc::from("#")),
2585 text: Arc::from("#"),
2586 start,
2587 end: self.position,
2588 })
2589 } else {
2590 None
2591 }
2592 }
2593 _ => None,
2594 }
2595 }
2596
    /// Lex a `"..."` double-quoted string starting at the opening quote.
    ///
    /// Accumulates plain text into `current_literal` and, when interpolation
    /// is enabled (`config.parse_interpolation`), splits the string into
    /// `StringPart`s: `Literal`, `Variable` (`$name`), `Expression` (`${...}`
    /// or `$name{...}`), `ArraySlice` (`$name[...]`), and `MethodCall`
    /// (`$name->...`). Returns a plain `StringLiteral` token when no
    /// interpolation parts were found, otherwise `InterpolatedString`.
    /// On EOF before the closing quote, consumes to EOF and returns an
    /// `Error` token via `unterminated_string_error`.
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                // Closing quote: flush any pending literal text and emit.
                '"' => {
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // Escape: keep the backslash and the escaped char verbatim in
                // the literal; interpretation is deferred to later phases.
                '\\' => {
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                // Interpolation start — only when the config enables it;
                // otherwise `$` falls through to the default literal arm.
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    let part_start = self.position;
                    self.advance();
                    match self.current_char() {
                        // `${ ... }` — arbitrary expression block.
                        Some('{') => {
                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                            parts.push(StringPart::Expression(Arc::from(
                                &self.input[part_start..self.position],
                            )));
                        }
                        // `$name` — scan the identifier, then look for a
                        // subscript or `->` chain after it.
                        Some(ch) if is_perl_identifier_start(ch) => {
                            let var_start = self.position;

                            // Fast path for ASCII identifier continuation
                            while self.position < self.input_bytes.len() {
                                let byte = self.input_bytes[self.position];
                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                    self.position += 1;
                                } else if byte >= 128 {
                                    // Only use UTF-8 parsing for non-ASCII
                                    if let Some(ch) = self.current_char() {
                                        if is_perl_identifier_continue(ch) {
                                            self.advance();
                                        } else {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                } else {
                                    break;
                                }
                            }

                            if self.position > var_start {
                                let var_name = &self.input[part_start..self.position];
                                parts.push(StringPart::Variable(Arc::from(var_name)));

                                // `$name->...` — dereference / method-call tail.
                                if self.matches_bytes(b"->") {
                                    let tail_start = self.position;
                                    self.advance();
                                    self.advance();

                                    match self.current_char() {
                                        // `->[...]` array-element deref.
                                        Some('[') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('[', ']', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // `->{...}` hash-element deref.
                                        Some('{') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('{', '}', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // `->(...)` code deref / call.
                                        Some('(') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('(', ')', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // `->method` or `->method(...)`.
                                        Some(ch) if is_perl_identifier_start(ch) => {
                                            while self.position < self.input_bytes.len() {
                                                let byte = self.input_bytes[self.position];
                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                                    self.position += 1;
                                                } else if byte >= 128 {
                                                    if let Some(ch) = self.current_char() {
                                                        if is_perl_identifier_continue(ch) {
                                                            self.advance();
                                                        } else {
                                                            break;
                                                        }
                                                    } else {
                                                        break;
                                                    }
                                                } else {
                                                    break;
                                                }
                                            }
                                            if self.current_char() == Some('(') {
                                                let _ = self.consume_balanced_segment_in_string(
                                                    '(', ')', '"',
                                                );
                                            }
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // Bare `->` with nothing recognizable after
                                        // it — record the arrow itself as the tail.
                                        _ => {
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                    }
                                } else if self.current_char() == Some('[') {
                                    // `$name[...]` array element / slice.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
                                    parts.push(StringPart::ArraySlice(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                } else if self.current_char() == Some('{') {
                                    // `$name{...}` hash element.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                                    parts.push(StringPart::Expression(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                }
                            }
                        }
                        _ => {
                            // NOTE(review): a `$` not followed by `{` or an
                            // identifier start is consumed here but appended to
                            // neither the literal nor the parts, so it is absent
                            // from the parts list (the token `text` still holds
                            // the full source slice) — confirm this is intended.
                        }
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress (defends against any
            // path that fails to advance; breaking yields the unterminated error).
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        Some(self.unterminated_string_error(start))
    }
2779
2780 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2781 self.advance(); // Skip opening quote
2782
2783 let mut last_pos = self.position;
2784
2785 while let Some(ch) = self.current_char() {
2786 match ch {
2787 '\'' => {
2788 self.advance();
2789 let text = &self.input[start..self.position];
2790 self.mode = LexerMode::ExpectOperator;
2791
2792 return Some(Token {
2793 token_type: TokenType::StringLiteral,
2794 text: Arc::from(text),
2795 start,
2796 end: self.position,
2797 });
2798 }
2799 '\\' => {
2800 self.advance();
2801 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2802 self.advance();
2803 }
2804 }
2805 _ => self.advance(),
2806 }
2807
2808 // Safety check: ensure we're making progress
2809 if self.position == last_pos {
2810 break;
2811 }
2812 last_pos = self.position;
2813 }
2814
2815 Some(self.unterminated_string_error(start))
2816 }
2817
2818 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2819 self.advance(); // Skip opening backtick
2820
2821 let mut last_pos = self.position;
2822
2823 while let Some(ch) = self.current_char() {
2824 match ch {
2825 '`' => {
2826 self.advance();
2827 let text = &self.input[start..self.position];
2828 self.mode = LexerMode::ExpectOperator;
2829
2830 return Some(Token {
2831 token_type: TokenType::QuoteCommand,
2832 text: Arc::from(text),
2833 start,
2834 end: self.position,
2835 });
2836 }
2837 '\\' => {
2838 self.advance();
2839 if self.current_char().is_some() {
2840 self.advance();
2841 }
2842 }
2843 _ => self.advance(),
2844 }
2845
2846 // Safety check: ensure we're making progress
2847 if self.position == last_pos {
2848 break;
2849 }
2850 last_pos = self.position;
2851 }
2852
2853 Some(self.unterminated_string_error(start))
2854 }
2855
    /// Placeholder for dedicated `q`-string parsing.
    ///
    /// Currently a stub that always returns `None`, so callers fall through to
    /// other parsing paths; the `_start` offset is unused.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
2860
2861 #[inline]
2862 fn unterminated_string_error(&mut self, start: usize) -> Token {
2863 // Consume to EOF so the caller receives a single terminal error token.
2864 let end = self.input.len();
2865 self.position = end;
2866
2867 Token {
2868 token_type: TokenType::Error(Arc::from("unterminated string")),
2869 text: Arc::from(&self.input[start..end]),
2870 start,
2871 end,
2872 }
2873 }
2874
2875 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
2876 // We've already consumed 's'
2877 let delimiter = self.current_char()?;
2878 self.advance(); // Skip delimiter
2879 self.parse_substitution_with_delimiter(start, delimiter)
2880 }
2881
2882 fn parse_substitution_with_delimiter(
2883 &mut self,
2884 start: usize,
2885 delimiter: char,
2886 ) -> Option<Token> {
2887 let (_pattern, pattern_closed) = self.read_delimited_body(delimiter);
2888 let replacement_closed;
2889
2890 let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
2891 if pattern_is_paired {
2892 self.skip_paired_substitution_replacement_gap();
2893
2894 if let Some(repl_delim) = self.current_char()
2895 && Self::is_quote_delim(repl_delim)
2896 {
2897 self.advance();
2898 let (_replacement, closed) = self.read_substitution_replacement_body(repl_delim);
2899 replacement_closed = closed;
2900 } else {
2901 replacement_closed = false;
2902 }
2903 } else {
2904 let (_replacement, closed) = self.read_substitution_replacement_body(delimiter);
2905 replacement_closed = closed;
2906 }
2907
2908 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2909 while let Some(ch) = self.current_char() {
2910 if ch.is_ascii_alphanumeric() {
2911 self.advance();
2912 } else {
2913 break;
2914 }
2915 }
2916
2917 let text = &self.input[start..self.position];
2918 self.mode = LexerMode::ExpectOperator;
2919
2920 let token_type = if pattern_closed && replacement_closed {
2921 TokenType::Substitution
2922 } else {
2923 TokenType::Error(Arc::from(format!(
2924 "unclosed quote-like operator 's' delimiter '{}'",
2925 delimiter
2926 )))
2927 };
2928
2929 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2930 }
2931
2932 fn skip_paired_substitution_replacement_gap(&mut self) {
2933 let mut comment_eligible = false;
2934 loop {
2935 let mut saw_whitespace = false;
2936 while self.current_char().is_some_and(char::is_whitespace) {
2937 self.advance();
2938 saw_whitespace = true;
2939 }
2940 comment_eligible |= saw_whitespace;
2941
2942 if comment_eligible && self.current_char() == Some('#') {
2943 while let Some(ch) = self.current_char() {
2944 self.advance();
2945 if matches!(ch, '\n' | '\r') {
2946 break;
2947 }
2948 }
2949 comment_eligible = true;
2950 continue;
2951 }
2952
2953 break;
2954 }
2955 }
2956
2957 fn read_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2958 if quote_handler::paired_close(delim).is_some() {
2959 return self.read_delimited_body(delim);
2960 }
2961
2962 self.read_unpaired_substitution_replacement_body(delim)
2963 }
2964
2965 fn read_unpaired_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2966 let mut body = String::new();
2967 let mut escaped = false;
2968
2969 while let Some(ch) = self.current_char() {
2970 if escaped {
2971 body.push(ch);
2972 self.advance();
2973 escaped = false;
2974 continue;
2975 }
2976
2977 match ch {
2978 '\\' => {
2979 body.push(ch);
2980 self.advance();
2981 escaped = true;
2982 }
2983 '"' | '\'' if ch != delim => {
2984 if let Some((string_end, true)) =
2985 self.scan_inner_string_for_delimiter(self.position, ch, delim)
2986 {
2987 if let Some(string_text) = self.input.get(self.position..string_end) {
2988 body.push_str(string_text);
2989 self.position = string_end;
2990 } else {
2991 body.push(ch);
2992 self.advance();
2993 }
2994 } else {
2995 body.push(ch);
2996 self.advance();
2997 }
2998 }
2999 c if c == delim => {
3000 self.advance();
3001 return (body, true);
3002 }
3003 _ => {
3004 body.push(ch);
3005 self.advance();
3006 }
3007 }
3008 }
3009
3010 (body, false)
3011 }
3012
3013 fn scan_inner_string_for_delimiter(
3014 &self,
3015 start: usize,
3016 quote: char,
3017 delim: char,
3018 ) -> Option<(usize, bool)> {
3019 let mut pos = start.checked_add(quote.len_utf8())?;
3020 let mut escaped = false;
3021 let mut contains_delim = false;
3022
3023 while let Some(ch) = self.input.get(pos..).and_then(|text| text.chars().next()) {
3024 if matches!(ch, '\n' | '\r') {
3025 return None;
3026 }
3027
3028 if escaped {
3029 if ch == delim {
3030 contains_delim = true;
3031 }
3032 pos += ch.len_utf8();
3033 escaped = false;
3034 continue;
3035 }
3036
3037 match ch {
3038 '\\' => {
3039 pos += ch.len_utf8();
3040 escaped = true;
3041 }
3042 c if c == quote => {
3043 return Some((pos + ch.len_utf8(), contains_delim));
3044 }
3045 c if c == delim => {
3046 contains_delim = true;
3047 pos += ch.len_utf8();
3048 }
3049 _ => {
3050 pos += ch.len_utf8();
3051 }
3052 }
3053 }
3054
3055 None
3056 }
3057
3058 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
3059 // We've already consumed 'tr' or 'y'
3060 while self.current_char().is_some_and(char::is_whitespace) {
3061 self.advance();
3062 }
3063
3064 let delimiter = self.current_char()?;
3065 self.advance(); // Skip delimiter
3066 self.parse_transliteration_with_delimiter(start, delimiter)
3067 }
3068
3069 fn parse_transliteration_with_delimiter(
3070 &mut self,
3071 start: usize,
3072 delimiter: char,
3073 ) -> Option<Token> {
3074 let (_search, search_closed) = self.read_delimited_body(delimiter);
3075 let replacement_closed;
3076
3077 let search_is_paired = quote_handler::paired_close(delimiter).is_some();
3078 if search_is_paired {
3079 while self.current_char().is_some_and(char::is_whitespace) {
3080 self.advance();
3081 }
3082
3083 if let Some(repl_delim) = self.current_char()
3084 && Self::is_quote_delim(repl_delim)
3085 {
3086 self.advance();
3087 let (_replacement, closed) = self.read_delimited_body(repl_delim);
3088 replacement_closed = closed;
3089 } else {
3090 replacement_closed = false;
3091 }
3092 } else {
3093 let (_replacement, closed) = self.read_delimited_body(delimiter);
3094 replacement_closed = closed;
3095 }
3096
3097 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
3098 while let Some(ch) = self.current_char() {
3099 if ch.is_ascii_alphanumeric() {
3100 self.advance();
3101 } else {
3102 break;
3103 }
3104 }
3105
3106 let text = &self.input[start..self.position];
3107 self.mode = LexerMode::ExpectOperator;
3108
3109 let token_type = if search_closed && replacement_closed {
3110 TokenType::Transliteration
3111 } else {
3112 TokenType::Error(Arc::from(format!(
3113 "unclosed quote-like operator '{}' delimiter '{}'",
3114 if self.input[start..].starts_with("tr") { "tr" } else { "y" },
3115 delimiter
3116 )))
3117 };
3118
3119 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3120 }
3121
3122 /// Read content between delimiters.
3123 ///
3124 /// Returns `(body, closed)` where `closed` is `true` if the closing
3125 /// delimiter was found before EOF, and `false` if EOF was reached first.
3126 fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
3127 let paired = quote_handler::paired_close(delim);
3128 let close = paired.unwrap_or(delim);
3129 let mut body = String::new();
3130 let mut depth = i32::from(paired.is_some());
3131
3132 while let Some(ch) = self.current_char() {
3133 if ch == '\\' {
3134 body.push(ch);
3135 self.advance();
3136 if let Some(next) = self.current_char() {
3137 body.push(next);
3138 self.advance();
3139 }
3140 continue;
3141 }
3142
3143 if paired.is_some() && ch == delim {
3144 body.push(ch);
3145 self.advance();
3146 depth += 1;
3147 continue;
3148 }
3149
3150 if ch == close {
3151 if paired.is_some() {
3152 depth -= 1;
3153 if depth == 0 {
3154 self.advance();
3155 return (body, true);
3156 }
3157 body.push(ch);
3158 self.advance();
3159 } else {
3160 self.advance();
3161 return (body, true);
3162 }
3163 continue;
3164 }
3165
3166 body.push(ch);
3167 self.advance();
3168 }
3169
3170 // EOF reached without finding the closing delimiter
3171 (body, false)
3172 }
3173
3174 /// Parse a quote operator after we've seen the delimiter
3175 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3176 let info = self.current_quote_op.as_ref()?;
3177 let start = info.start_pos;
3178 let operator = info.operator.clone();
3179
3180 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3181 // below) does not leave a stale reference behind. The post-match cleanup at the
3182 // bottom of this function would otherwise be skipped for those operators.
3183 self.current_quote_op = None;
3184
3185 // Parse based on operator type; track whether all delimiters were closed.
3186 let closed = match operator.as_str() {
3187 "s" => {
3188 return self.parse_substitution_with_delimiter(start, delimiter);
3189 }
3190 "tr" | "y" => {
3191 return self.parse_transliteration_with_delimiter(start, delimiter);
3192 }
3193 "qr" => {
3194 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3195 self.parse_regex_modifiers("e_handler::QR_SPEC);
3196 body_closed
3197 }
3198 "m" => {
3199 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3200 self.parse_regex_modifiers("e_handler::M_SPEC);
3201 body_closed
3202 }
3203 _ => {
3204 // q, qq, qw, qx - no modifiers
3205 let (_body, body_closed) = self.read_delimited_body(delimiter);
3206 body_closed
3207 }
3208 };
3209
3210 let text = &self.input[start..self.position];
3211
3212 self.mode = LexerMode::ExpectOperator;
3213
3214 if !closed {
3215 // EOF reached before finding the closing delimiter — emit an error
3216 // token so the parser's recovery mechanism records a diagnostic.
3217 return Some(Token {
3218 token_type: TokenType::Error(Arc::from(format!(
3219 "unclosed {} delimiter '{}'",
3220 operator, delimiter
3221 ))),
3222 text: Arc::from(text),
3223 start,
3224 end: self.position,
3225 });
3226 }
3227
3228 let token_type = quote_handler::get_quote_token_type(&operator);
3229 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3230 }
3231
3232 /// Parse regex modifiers according to the given spec
3233 ///
3234 /// This function includes ALL characters that could be intended as modifiers,
3235 /// including invalid ones. This allows the parser to properly reject invalid
3236 /// modifiers with a clear error message, rather than leaving them as separate
3237 /// tokens that could be confusingly parsed.
3238 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3239 // Consume all alphanumeric characters that could be intended as modifiers
3240 // The parser will validate and reject invalid ones
3241 while let Some(ch) = self.current_char() {
3242 if ch.is_ascii_alphanumeric() {
3243 self.advance();
3244 } else {
3245 break;
3246 }
3247 }
3248 // Note: We no longer validate here - the parser will validate and provide
3249 // clear error messages for invalid modifiers (MUT_005 fix)
3250 }
3251
    /// Parse a regex literal starting with `/`
    ///
    /// **Budget Protection (Issue #422)**:
    /// - Budget guards prevent runaway scanning on pathological input
    /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
    /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
    /// - Graceful degradation: emit UnknownRest token if budget exceeded
    ///
    /// **Performance**:
    /// - Single-pass scanning with escape handling
    /// - Budget check per iteration (amortized O(1) via inline fast path)
    /// - Typical regex: <10μs, Large regex (64KB): ~1ms
    fn parse_regex(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening /

        let mut regex_parse_steps: usize = 0;
        // Tracks `[...]` so a `/` inside a character class does not terminate
        // the literal (e.g. `/[a/b]/` is one regex).
        let mut in_character_class = false;

        while let Some(ch) = self.current_char() {
            regex_parse_steps += 1;
            if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
                #[cfg(debug_assertions)]
                {
                    let text = &self.input[start..self.position];
                    let preview = truncate_preview(text, 50);
                    tracing::debug!(
                        limit = MAX_REGEX_PARSE_STEPS,
                        pattern_preview = %preview,
                        "Regex parse step budget exceeded"
                    );
                }
                // Give up gracefully: jump to EOF and hand the remainder to
                // the parser as a single opaque token.
                self.position = self.input.len();
                return Some(Token {
                    token_type: TokenType::UnknownRest,
                    text: empty_arc(),
                    start,
                    end: self.position,
                });
            }

            // Budget guard: prevent timeout on pathological input (Issue #422)
            // If exceeded, returns UnknownRest token for graceful degradation
            if let Some(token) = self.budget_guard(start, 0) {
                return Some(token);
            }

            match ch {
                '/' if !in_character_class => {
                    self.advance();
                    // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
                    while let Some(ch) = self.current_char() {
                        if ch.is_ascii_alphanumeric() {
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    // A completed regex is a term; an operator may follow.
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::RegexMatch,
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Handle escape sequences: consume backslash + next char
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                '[' => {
                    in_character_class = true;
                    self.advance();
                }
                ']' if in_character_class => {
                    in_character_class = false;
                    self.advance();
                }
                _ => self.advance(),
            }
        }

        // Unterminated regex - EOF reached before closing /
        // Parser will emit diagnostic for unterminated literal
        None
    }
3343}
3344
// Shared empty `Arc<str>`: initialized once, then every call is just a
// refcount bump instead of a fresh allocation.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return a cheaply-cloneable empty `Arc<str>`.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
3352
/// Truncate `text` to at most `max_chars` characters for log previews,
/// appending `...` when anything was cut. Operates on `char` boundaries,
/// so multi-byte UTF-8 input is never split mid-character.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    if let Some((cut, _)) = text.char_indices().nth(max_chars) {
        // `cut` is the byte offset of the first character past the limit.
        let mut preview = String::with_capacity(cut + 3);
        preview.push_str(&text[..cut]);
        preview.push_str("...");
        preview
    } else {
        // Short enough already — return it unchanged.
        text.to_string()
    }
}
3359
3360#[inline(always)]
3361fn is_keyword_fast(word: &str) -> bool {
3362 // Fast length-based rejection for most cases.
3363 // Lexer keywords are currently bounded to 1..=9 characters.
3364 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
3365}
3366
/// True if `word` is a Perl builtin that may appear as a bare term.
///
/// O(log n) lookup; relies on `BARE_TERM_BUILTINS` being kept sorted.
#[inline]
fn is_builtin_function(word: &str) -> bool {
    BARE_TERM_BUILTINS.binary_search(&word).is_ok()
}
3371
/// True if `word` names one of the quote-like operators the lexer recognizes.
#[inline(always)]
fn is_quote_op_word_prefix(word: &[u8]) -> bool {
    // Complete set of quote-op names: match, quote, double-quote, word list,
    // command execution, and regex-quote.
    const QUOTE_OPS: [&[u8]; 6] = [b"m", b"q", b"qq", b"qw", b"qx", b"qr"];
    QUOTE_OPS.contains(&word)
}
3376
/// Builtins that may start a bare term (no parentheses required).
///
/// INVARIANT: this list MUST remain sorted — `is_builtin_function` looks
/// entries up with `binary_search`. Keep new entries in order.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex", "int",
    "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print", "push",
    "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt", "substr",
    "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
3383
/// Fast lookup table for compound operator second characters.
///
/// FIX: `/` was missing, which made the `(b'/', b'/')` arm below unreachable —
/// the ASCII fast path rejected Perl's defined-or operator `//` even though
/// the non-ASCII fallback accepted it. The table now agrees with the match.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+-.~*:/";

/// True if `first` followed by `second` forms a two-character Perl operator
/// (e.g. `==`, `=~`, `->`, `//`, `::`).
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    // Optimized compound operator lookup using perfect hashing for common cases
    // Convert to bytes for faster comparison (most operators are ASCII)
    if first.is_ascii() && second.is_ascii() {
        let first_byte = first as u8;
        let second_byte = second as u8;

        // Reject quickly when the second byte can never complete an operator.
        if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
            return false;
        }

        // Use lookup table approach for maximum performance
        match (first_byte, second_byte) {
            // Assignment operators
            (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,

            // Comparison operators
            (b'<' | b'>' | b'=' | b'!', b'=') => true,

            // Pattern operators
            (b'=' | b'!', b'~') => true,

            // Increment/decrement
            (b'+', b'+') | (b'-', b'-') => true,

            // Logical operators
            (b'&', b'&') | (b'|', b'|') => true,

            // Shift operators
            (b'<', b'<') | (b'>', b'>') => true,

            // Other compound operators
            (b'*', b'*')
            | (b'/', b'/')
            | (b'-' | b'=', b'>')
            | (b'.', b'.')
            | (b'~', b'~')
            | (b':', b':') => true,

            _ => false,
        }
    } else {
        // Fallback for non-ASCII (should be rare)
        matches!(
            (first, second),
            ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
                | ('=' | '!' | '~', '~')
                | ('+', '+')
                | ('-', '-' | '>')
                | ('&', '&')
                | ('|', '|')
                | ('<', '<')
                | ('>' | '=', '>')
                | ('*', '*')
                | ('/', '/')
                | ('.', '.')
                | (':', ':')
        )
    }
}
3448
3449// Checkpoint support for incremental parsing
3450
3451mod checkpoint_impl;
3452
3453#[cfg(test)]
3454mod test_format_debug;
3455#[cfg(test)]
3456mod tests;