// perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![warn(clippy::all)]
117#![allow(
118 // Core allows for lexer code
119 clippy::too_many_lines,
120 clippy::module_name_repetitions,
121 clippy::cast_possible_truncation,
122 clippy::cast_sign_loss,
123 clippy::cast_possible_wrap,
124 clippy::cast_precision_loss,
125 clippy::must_use_candidate,
126 clippy::missing_errors_doc,
127 clippy::missing_panics_doc,
128
129 // Lexer-specific patterns that are fine
130 clippy::match_same_arms,
131 clippy::redundant_else,
132 clippy::unnecessary_wraps,
133 clippy::unused_self,
134 clippy::items_after_statements,
135 clippy::struct_excessive_bools,
136 clippy::uninlined_format_args
137)]
138
139use std::sync::{Arc, OnceLock};
140
141pub mod api;
142pub mod builtins;
143pub mod checkpoint;
144pub mod config;
145pub mod error;
146mod heredoc;
147pub mod keywords;
148mod lexer;
149pub mod limits;
150pub mod mode;
151mod quote_handler;
152pub mod token;
153pub mod tokenizer;
154mod unicode;
155
156pub use api::*;
157pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
158pub use config::LexerConfig;
159pub use error::{LexerError, Result};
160pub use lexer::PerlLexer;
161pub use limits::MAX_REGEX_PARSE_STEPS;
162pub use mode::LexerMode;
163pub use perl_position_tracking::Position;
164pub use token::{StringPart, Token, TokenType};
165
166use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
167
168use crate::heredoc::HeredocSpec;
169use crate::limits::{
170 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
171};
172
173impl<'a> PerlLexer<'a> {
174 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
175 pub fn with_body_tokens(input: &'a str) -> Self {
176 let mut lexer = Self::new(input);
177 lexer.emit_heredoc_body_tokens = true;
178 lexer
179 }
180
    /// Set the lexer mode (for resetting state at statement boundaries)
    pub fn set_mode(&mut self, mode: LexerMode) {
        // Overrides the current term/operator expectation; the parser uses
        // this to resynchronize context at statement boundaries.
        self.mode = mode;
    }
185
    /// Advance the lexer and return the next token.
    ///
    /// Returns `None` only after an `EOF` token has already been emitted.
    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
    ///
    /// Internally this is a state machine: format/data-section modes are
    /// dispatched first, then any pending heredoc body is consumed line by
    /// line, and only then is ordinary tokenization attempted via the
    /// `try_*` helpers (whose call order encodes lexing priority).
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            // (body_start == 0 is the sentinel for "not inside a body")
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // Allow any leading spaces/tabs before the label
                            // (the <<~LABEL indented-heredoc form)
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any)
                            // so stacked heredocs (<<A . <<B) chain correctly
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first.
            // NOTE: the order of these try_* calls encodes lexing priority
            // (heredocs before strings before variables, etc.) — do not reorder.
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_vstring() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Unicode path without intermediate heap allocation
                let mut buf = [0_u8; 4];
                Arc::from(ch.encode_utf8(&mut buf))
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
450
451 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
452 ///
453 /// **Purpose**: Protect against pathological input that could cause:
454 /// - Infinite loops in regex/heredoc parsing
455 /// - Excessive memory consumption
456 /// - LSP server hangs
457 ///
458 /// **Limits**:
459 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
460 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
461 ///
462 /// **Graceful Degradation**:
463 /// - Budget exceeded → emit `UnknownRest` token
464 /// - Jump to EOF to prevent further parsing of problematic region
465 /// - LSP client can emit soft diagnostic about truncation
466 /// - All previously parsed symbols remain valid
467 ///
468 /// **Performance**:
469 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
470 /// - Slow path: Only triggered on pathological input
471 /// - Amortized cost: O(1) per token
472 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
473 #[inline(always)]
474 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
475 // Fast path: most calls won't hit limits
476 let bytes_consumed = self.position - start;
477 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
478 return None;
479 }
480
481 // Slow path: budget exceeded - graceful degradation
482 #[cfg(debug_assertions)]
483 {
484 tracing::debug!(
485 bytes_consumed,
486 depth,
487 position = self.position,
488 "Lexer budget exceeded"
489 );
490 }
491
492 self.position = self.input.len();
493 Some(Token {
494 token_type: TokenType::UnknownRest,
495 text: Arc::from(""),
496 start,
497 end: self.position,
498 })
499 }
500
501 /// Peek at the next token without consuming it.
502 ///
503 /// Saves and restores the full lexer state so the next call to
504 /// [`next_token`](Self::next_token) returns the same token.
505 pub fn peek_token(&mut self) -> Option<Token> {
506 let saved_pos = self.position;
507 let saved_mode = self.mode;
508 let saved_delimiter_stack = self.delimiter_stack.clone();
509 let saved_prototype = self.in_prototype;
510 let saved_depth = self.prototype_depth;
511 let saved_after_sub = self.after_sub;
512 let saved_after_arrow = self.after_arrow;
513 let saved_hash_brace_depth = self.hash_brace_depth;
514 let saved_after_var_subscript = self.after_var_subscript;
515 let saved_paren_depth = self.paren_depth;
516 let saved_current_pos = self.current_pos;
517 let saved_after_newline = self.after_newline;
518 let saved_pending_heredocs = self.pending_heredocs.clone();
519 let saved_line_start_offset = self.line_start_offset;
520 let saved_current_quote_op = self.current_quote_op.clone();
521 let saved_eof_emitted = self.eof_emitted;
522 let saved_start_time = self.start_time;
523
524 let token = self.next_token();
525
526 self.position = saved_pos;
527 self.mode = saved_mode;
528 self.delimiter_stack = saved_delimiter_stack;
529 self.in_prototype = saved_prototype;
530 self.prototype_depth = saved_depth;
531 self.after_sub = saved_after_sub;
532 self.after_arrow = saved_after_arrow;
533 self.hash_brace_depth = saved_hash_brace_depth;
534 self.after_var_subscript = saved_after_var_subscript;
535 self.paren_depth = saved_paren_depth;
536 self.current_pos = saved_current_pos;
537 self.after_newline = saved_after_newline;
538 self.pending_heredocs = saved_pending_heredocs;
539 self.line_start_offset = saved_line_start_offset;
540 self.current_quote_op = saved_current_quote_op;
541 self.eof_emitted = saved_eof_emitted;
542 self.start_time = saved_start_time;
543
544 token
545 }
546
547 /// Consume all remaining tokens and return them as a vector.
548 ///
549 /// The returned vector always ends with an `EOF` token.
550 pub fn collect_tokens(&mut self) -> Vec<Token> {
551 let mut tokens = Vec::new();
552 while let Some(token) = self.next_token() {
553 if token.token_type == TokenType::EOF {
554 tokens.push(token);
555 break;
556 }
557 tokens.push(token);
558 }
559 tokens
560 }
561
    /// Reset the lexer to the beginning of the input.
    ///
    /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
    /// so the lexer can re-tokenize the same source from scratch.
    ///
    /// NOTE(review): any new mutable field added to the lexer should be
    /// reset here as well, and saved/restored in `peek_token`.
    pub fn reset(&mut self) {
        // Cursor and expectation mode.
        self.position = 0;
        self.mode = LexerMode::ExpectTerm;
        // Quote-operator / prototype machinery.
        self.delimiter_stack.clear();
        self.in_prototype = false;
        self.prototype_depth = 0;
        // Context flags used for operator disambiguation.
        self.after_sub = false;
        self.after_arrow = false;
        self.hash_brace_depth = 0;
        self.after_var_subscript = false;
        self.paren_depth = 0;
        // Line/column tracking; start of input counts as a line start.
        self.current_pos = Position::start();
        self.after_newline = true;
        // Heredoc and quote-op state.
        self.pending_heredocs.clear();
        self.line_start_offset = 0;
        self.current_quote_op = None;
        self.eof_emitted = false;
        // Restart the clock used for the heredoc timeout guard.
        self.start_time = std::time::Instant::now();
    }
585
    /// Switch the lexer into format-body parsing mode.
    ///
    /// In this mode the lexer consumes input verbatim until it encounters a
    /// line containing only `.` (the Perl format terminator).
    /// `next_token` dispatches to `parse_format_body` while this mode is set.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
593
594 // Internal helper methods
595
596 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
597 #[inline(always)]
598 fn byte_at(bytes: &[u8], index: usize) -> u8 {
599 debug_assert!(index < bytes.len());
600 match bytes.get(index) {
601 Some(&byte) => byte,
602 None => 0,
603 }
604 }
605
606 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
607 #[inline(always)]
608 fn current_char(&self) -> Option<char> {
609 if self.position < self.input_bytes.len() {
610 // For ASCII, direct access is safe
611 let byte = Self::byte_at(self.input_bytes, self.position);
612 if byte < 128 {
613 Some(byte as char)
614 } else {
615 // For non-ASCII, fall back to proper UTF-8 parsing
616 self.input.get(self.position..).and_then(|s| s.chars().next())
617 }
618 } else {
619 None
620 }
621 }
622
623 #[inline(always)]
624 fn peek_char(&self, offset: usize) -> Option<char> {
625 if offset > self.config.max_lookahead {
626 return None;
627 }
628
629 let pos = self.position.checked_add(offset)?;
630 if pos < self.input_bytes.len() {
631 // For ASCII, direct access is safe
632 let byte = Self::byte_at(self.input_bytes, pos);
633 if byte < 128 {
634 Some(byte as char)
635 } else {
636 // For non-ASCII, use chars iterator
637 self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
638 }
639 } else {
640 None
641 }
642 }
643
644 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
645 #[inline(always)]
646 fn advance(&mut self) {
647 if self.position < self.input_bytes.len() {
648 let byte = Self::byte_at(self.input_bytes, self.position);
649 if byte < 128 {
650 // ASCII fast path
651 self.position += 1;
652 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
653 {
654 self.position += ch.len_utf8();
655 }
656 }
657 }
658
    /// General-purpose balanced-segment consumer (no quote-boundary recovery).
    ///
    /// Consumes an `open`-delimited segment starting at the cursor, tracking
    /// nesting depth and skipping backslash-escaped characters. Returns the
    /// position just past the matching `close`, or `None` if the opener is
    /// absent or the segment never closes (the cursor is left wherever
    /// scanning stopped).
    ///
    /// NOTE(review): the `open` arm is matched before `close`, so passing the
    /// same character for both would increment depth forever — callers are
    /// assumed to pass distinct paired delimiters; confirm at call sites.
    ///
    /// For use inside double-quoted string interpolation where the outer `"` must
    /// act as a recovery boundary, use [`consume_balanced_segment_in_string`] instead.
    #[allow(dead_code)]
    #[inline]
    fn consume_balanced_segment(&mut self, open: char, close: char) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // Ran off the end of input without closing the segment.
        None
    }
697
    /// Balanced-segment consumer for use inside a quoted string.
    ///
    /// Like `consume_balanced_segment`, but bails out (returns `None`) as
    /// soon as the string's closing quote `terminator` is seen, so the outer
    /// string parser can still terminate its token cleanly.
    ///
    /// NOTE(review): the `terminator` arm is matched before `open`/`close`,
    /// so if `terminator` equals either delimiter the bail-out wins; callers
    /// are assumed to pass distinct characters — confirm at call sites.
    #[inline]
    fn consume_balanced_segment_in_string(
        &mut self,
        open: char,
        close: char,
        terminator: char,
    ) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == terminator => {
                    // Local recovery for interpolation tails in quoted strings:
                    // stop at the closing quote so the outer string parser can
                    // still terminate this token cleanly.
                    return None;
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // Ran off the end of input without closing the segment.
        None
    }
742
743 /// Fast byte-level check for ASCII characters
744 #[inline]
745 fn peek_byte(&self, offset: usize) -> Option<u8> {
746 if offset > self.config.max_lookahead {
747 return None;
748 }
749
750 let pos = self.position.checked_add(offset)?;
751 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
752 }
753
754 /// Check if the next bytes match a pattern (ASCII only)
755 #[inline]
756 fn matches_bytes(&self, pattern: &[u8]) -> bool {
757 let Some(end_offset) = pattern.len().checked_sub(1) else {
758 return true;
759 };
760
761 if end_offset > self.config.max_lookahead {
762 return false;
763 }
764
765 let Some(end) = self.position.checked_add(pattern.len()) else {
766 return false;
767 };
768
769 if end <= self.input_bytes.len() {
770 &self.input_bytes[self.position..end] == pattern
771 } else {
772 false
773 }
774 }
775
    /// Skip whitespace, `#` comments, and POD blocks before the next token.
    ///
    /// Side effects: maintains `after_newline`, and assigns `body_start` to
    /// the first unresolved pending heredoc when a newline is consumed.
    /// Always returns `Some(())`; the `Option` return type keeps `?` usable
    /// at call sites.
    #[inline]
    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
        // Don't reset after_newline if we're at the start of a line
        if self.position > 0 && self.position != self.line_start_offset {
            self.after_newline = false;
        }

        while self.position < self.input_bytes.len() {
            let byte = Self::byte_at(self.input_bytes, self.position);
            match byte {
                // Fast path for ASCII whitespace - batch process
                b' ' => {
                    // Batch skip spaces for better cache efficiency
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b' '
                    {
                        self.position += 1;
                    }
                    // Continue outer loop if we processed any spaces
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\t' | 0x0B | 0x0C => {
                    // Batch skip horizontal tab, vertical tab, and form feed.
                    // Perl treats these as whitespace separators.
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && matches!(
                            Self::byte_at(self.input_bytes, self.position),
                            b'\t' | 0x0B | 0x0C
                        )
                    {
                        self.position += 1;
                    }
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\r' | b'\n' => {
                    self.consume_newline();

                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                    // Only check if we have pending heredocs to avoid unnecessary work
                    if !self.pending_heredocs.is_empty() {
                        for spec in &mut self.pending_heredocs {
                            if spec.body_start == 0 {
                                spec.body_start = self.position;
                                break; // Only set for the first unresolved heredoc
                            }
                        }
                    }
                }
                b'#' => {
                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                    // (e.g. `q#...#`), so stop skipping and let the tokenizer see it.
                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
                        break;
                    }

                    // Skip line comment using memchr for fast newline search
                    self.position += 1; // Skip # directly

                    // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
                    if let Some(newline_offset) =
                        memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
                    {
                        self.position += newline_offset;
                    } else {
                        // No newline found, skip to end
                        self.position = self.input_bytes.len();
                    }
                }
                // '=' at the start of a line may introduce a POD section.
                b'=' if self.position == 0
                    || (self.position > 0
                        && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
                {
                    // Check if this starts a POD section (=pod, =head, =over, etc.)
                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                    let remaining = &self.input_bytes[self.position..];
                    if remaining.starts_with(b"=pod")
                        || remaining.starts_with(b"=head")
                        || remaining.starts_with(b"=over")
                        || remaining.starts_with(b"=item")
                        || remaining.starts_with(b"=back")
                        || remaining.starts_with(b"=begin")
                        || remaining.starts_with(b"=end")
                        || remaining.starts_with(b"=for")
                        || remaining.starts_with(b"=encoding")
                    {
                        // Scan forward for \n=cut (end of POD block)
                        let search_start = self.position;
                        let mut found_cut = false;
                        let bytes = self.input_bytes;
                        let mut i = search_start;
                        while i < bytes.len() {
                            // Look for =cut at the start of a line
                            if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
                                && bytes[i..].starts_with(b"=cut")
                            {
                                i += 4; // Skip "=cut"
                                // Skip rest of the =cut line
                                while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
                                    i += 1;
                                }
                                // Consume one line ending sequence if present
                                // (handles CRLF, lone CR, and lone LF)
                                if i < bytes.len() && bytes[i] == b'\r' {
                                    i += 1;
                                    if i < bytes.len() && bytes[i] == b'\n' {
                                        i += 1;
                                    }
                                } else if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                                self.position = i;
                                found_cut = true;
                                break;
                            }
                            i += 1;
                        }
                        if !found_cut {
                            // POD extends to end of file
                            self.position = bytes.len();
                        }
                        continue;
                    }
                    // Not a POD directive - regular '=' token
                    break;
                }
                _ => {
                    // For non-ASCII whitespace, use char check only when needed
                    if byte >= 128
                        && let Some(ch) = self.current_char()
                        && ch.is_whitespace()
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }
        }
        Some(())
    }
920
    /// Try to lex a heredoc introducer: `<<LABEL`, `<<~LABEL`, `<<"L"`,
    /// `<<'L'`, `` <<`L` ``, or `<<\L`.
    ///
    /// On success emits a `HeredocStart` token and queues a [`HeredocSpec`];
    /// the body itself is consumed later by `next_token` once the current
    /// line ends. Returns `None` (with the cursor restored) when `<<` is
    /// really the left-shift operator.
    fn try_heredoc(&mut self) -> Option<Token> {
        // `<<` is the left-shift operator, not a heredoc, when we are inside
        // a parenthesized expression and have just finished a term.
        // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
        // so `<<index` must be the bitshift operator, not a heredoc start.
        //
        // We must NOT fire the guard at statement level (paren_depth == 0) because
        // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
        // is a heredoc. The depth check distinguishes the two cases.
        if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
            return None;
        }

        // Check for heredoc start
        if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
            return None;
        }

        let start = self.position;
        let mut text = String::from("<<");
        self.position += 2; // Skip <<

        // Check for indented heredoc (~)
        let allow_indent = if self.current_char() == Some('~') {
            text.push('~');
            self.advance();
            true
        } else {
            false
        };

        // Skip whitespace (kept in `text` so the token reproduces the source)
        while let Some(ch) = self.current_char() {
            if ch == ' ' || ch == '\t' {
                text.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        // Optional backslash disables interpolation, treat like single-quoted label
        let backslashed = if self.current_char() == Some('\\') {
            text.push('\\');
            self.advance();
            true
        } else {
            false
        };

        // Parse delimiter
        let delimiter = if self.position < self.input.len() {
            match self.current_char() {
                Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
                Some('\'') if !backslashed => {
                    self.parse_quoted_heredoc_delimiter('\'', &mut text)?
                }
                Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
                Some(c) if is_perl_identifier_start(c) => {
                    // Bare word delimiter
                    let mut delim = String::new();
                    while self.position < self.input.len() {
                        if let Some(c) = self.current_char() {
                            if is_perl_identifier_continue(c) {
                                delim.push(c);
                                text.push(c);
                                self.advance();
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    delim
                }
                _ => {
                    // Not a valid heredoc delimiter - reset position and return None
                    // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
                    self.position = start;
                    return None;
                }
            }
        } else {
            // No delimiter found - reset position and return None
            self.position = start;
            return None;
        };

        // For now, return a placeholder token
        // The actual heredoc body would be parsed later when we encounter it
        self.mode = LexerMode::ExpectOperator;

        // Recursion depth limit (Issue #443)
        if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
            return Some(Token {
                token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
                text: Arc::from(text),
                start,
                end: self.position,
            });
        }

        // Queue the heredoc spec with its label
        self.pending_heredocs.push(HeredocSpec {
            label: Arc::from(delimiter.as_str()),
            body_start: 0, // Will be set when we see the newline after this line
            allow_indent,
        });

        Some(Token {
            token_type: TokenType::HeredocStart,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1038
1039 fn try_string(&mut self) -> Option<Token> {
1040 let start = self.position;
1041 let quote = self.current_char()?;
1042
1043 match quote {
1044 '"' => self.parse_double_quoted_string(start),
1045 '\'' => self.parse_single_quoted_string(start),
1046 '`' => self.parse_backtick_string(start),
1047 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1048 _ => None,
1049 }
1050 }
1051
    /// Lex a numeric literal at the current position.
    ///
    /// Handles hex (`0x`), binary (`0b`), and explicit octal (`0o`) prefixes,
    /// plain decimal integers, floats, and exponents, all with `_` digit
    /// separators. Returns `None` when the current byte is not an ASCII
    /// digit; on success the lexer switches to `ExpectOperator` mode.
    ///
    /// Backtracking rules visible in the code below:
    /// - A radix prefix with no real digit after it (e.g. `0x_` or bare `0x`)
    ///   falls through so the leading `0` is lexed as a decimal number.
    /// - An `e`/`E` with no digit after it (even after a sign) is not part of
    ///   the number; the scan rewinds to just before the marker.
    /// - A trailing `.` is consumed only when followed by EOF, whitespace, or
    ///   an operator-like byte, so ranges like `1..5` keep `..` intact.
    #[inline]
    fn try_number(&mut self) -> Option<Token> {
        let start = self.position;

        // Fast byte check for digits - optimized bounds checking
        let bytes = self.input_bytes;
        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
            return None;
        }

        // Check for hex (0x), binary (0b), or octal (0o) prefixes
        let mut pos = self.position;
        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
            let prefix_byte = bytes[pos + 1];
            if prefix_byte == b'x' || prefix_byte == b'X' {
                // Hexadecimal: 0x[0-9a-fA-F_]+
                pos += 2; // consume '0x'
                let digit_start = pos;
                // saw_digit guards against `0x____`: underscores alone do not
                // make a valid hex literal.
                let mut saw_digit = false;
                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
                    saw_digit |= bytes[pos].is_ascii_hexdigit();
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No hex digits after 0x - fall through to parse '0' as decimal
            } else if prefix_byte == b'b' || prefix_byte == b'B' {
                // Binary: 0b[01_]+
                pos += 2; // consume '0b'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
                {
                    saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No binary digits after 0b - fall through to parse '0' as decimal
            } else if prefix_byte == b'o' || prefix_byte == b'O' {
                // Octal (explicit): 0o[0-7_]+
                pos += 2; // consume '0o'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
                {
                    saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No octal digits after 0o - fall through to parse '0' as decimal
            }
        }

        // Consume initial digits - unrolled for better performance
        // (pos is re-synced to self.position here in case a radix prefix
        // above advanced pos but then fell through without committing.)
        pos = self.position;
        while pos < bytes.len() {
            let byte = Self::byte_at(bytes, pos);
            if byte.is_ascii_digit() || byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        self.position = pos;

        // Check for decimal point - optimized with single bounds check
        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
            // Peek ahead to see what follows the dot
            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();

            // Optimized dot consumption logic: take the dot when digits
            // follow, at EOF, or before whitespace/operator bytes; note '.'
            // itself is NOT in the list, so `1..5` leaves `..` untouched.
            let should_consume_dot = has_following_digit || {
                pos + 1 >= bytes.len() || {
                    // Use bitwise operations for faster character classification
                    let next_byte = bytes[pos + 1];
                    // Whitespace, delimiters, operators - optimized check
                    next_byte <= b' '
                        || matches!(
                            next_byte,
                            b';' | b','
                                | b')'
                                | b'}'
                                | b']'
                                | b'+'
                                | b'-'
                                | b'*'
                                | b'/'
                                | b'%'
                                | b'='
                                | b'<'
                                | b'>'
                                | b'!'
                                | b'&'
                                | b'|'
                                | b'^'
                                | b'~'
                                | b'e'
                                | b'E'
                        )
                }
            };

            if should_consume_dot {
                pos += 1; // consume the dot
                // Consume fractional digits - batch processing
                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                self.position = pos;
            }
        }

        // Check for exponent - optimized
        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
            let exp_start = pos;
            pos += 1; // consume 'e' or 'E'

            // Check for optional sign
            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                pos += 1;
            }

            // Must have at least one digit after exponent (underscores allowed between digits)
            let mut saw_digit = false;
            while pos < bytes.len() {
                let byte = bytes[pos];
                if byte.is_ascii_digit() {
                    saw_digit = true;
                    pos += 1;
                } else if byte == b'_' {
                    pos += 1;
                } else {
                    break;
                }
            }

            // If no digits after exponent, backtrack
            if !saw_digit {
                pos = exp_start;
            }

            self.position = pos;
        }

        // Avoid string slicing for common number cases - use Arc::from directly on slice
        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Number(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1238
1239 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1240 // We're at the dot, consume it
1241 self.advance();
1242
1243 // Parse the fractional part
1244 while self.position < self.input_bytes.len() {
1245 let byte = self.input_bytes[self.position];
1246 match byte {
1247 b'0'..=b'9' | b'_' => self.position += 1,
1248 b'e' | b'E' => {
1249 // Handle scientific notation.
1250 // Save the position of 'e'/'E' so we can backtrack here if
1251 // no digits follow the exponent marker (with or without sign).
1252 let e_pos = self.position;
1253 self.advance();
1254 if self.position < self.input_bytes.len() {
1255 let next = self.input_bytes[self.position];
1256 if next == b'+' || next == b'-' {
1257 self.advance();
1258 }
1259 }
1260 // Parse exponent digits (underscores allowed between digits)
1261 let exponent_start = self.position;
1262 let mut saw_digit = false;
1263 while self.position < self.input_bytes.len() {
1264 let byte = self.input_bytes[self.position];
1265 if byte.is_ascii_digit() {
1266 saw_digit = true;
1267 self.position += 1;
1268 } else if byte == b'_' {
1269 self.position += 1;
1270 } else {
1271 break;
1272 }
1273 }
1274
1275 // No digits after exponent marker — backtrack to just before
1276 // 'e'/'E' so the caller sees it as a separate token.
1277 // Using e_pos (not exponent_start-1) avoids including 'e' in
1278 // the number slice when a sign character was consumed.
1279 if !saw_digit {
1280 let _ = exponent_start; // mark as intentionally unused
1281 self.position = e_pos;
1282 }
1283 break;
1284 }
1285 _ => break,
1286 }
1287 }
1288
1289 let text = &self.input[start..self.position];
1290 self.mode = LexerMode::ExpectOperator;
1291
1292 Some(Token {
1293 token_type: TokenType::Number(Arc::from(text)),
1294 text: Arc::from(text),
1295 start,
1296 end: self.position,
1297 })
1298 }
1299
    /// Lex a variable token beginning with a sigil (`$`, `@`, `%`, `*`).
    ///
    /// Covers plain variables (`$foo`, `@Foo::bar`), the `$#array` length
    /// form, braced forms (`${foo}`, `${^MATCH}`, `$::{foo}`, `*{$glob}`),
    /// punctuation variables (`$!`, `$?`, `$$`, `@+`, `%-`, `$^W`, ...),
    /// and the split-token cases where only the bare sigil is returned so
    /// the parser can handle a dereference (`@{$ref}`) or postfix deref
    /// (`->@*`). Returns `None` when the current char is not a sigil, or
    /// when `%`/`*` appear in `ExpectOperator` mode (operators there).
    ///
    /// Side effects: advances the position past the consumed token, sets
    /// `mode = ExpectOperator`, and maintains `after_var_subscript` so a
    /// following `{` is treated as a subscript rather than a block opener.
    fn try_variable(&mut self) -> Option<Token> {
        let start = self.position;
        let sigil = self.current_char()?;

        match sigil {
            '$' | '@' | '%' | '*' => {
                // In ExpectOperator mode, treat % and * as operators rather than sigils
                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                    return None;
                }
                self.advance();

                // Special case: After ->, sigils followed by { or [ should be tokenized separately
                // This is for postfix dereference like ->@*, ->%{}, ->@[]
                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
                let check_arrow = self.position >= 3
                    && self.position.saturating_sub(1) <= self.input.len()
                    && self.input.is_char_boundary(self.position.saturating_sub(3))
                    && self.input.is_char_boundary(self.position.saturating_sub(1));

                if check_arrow
                    && {
                        // Temporarily rewind 3 bytes (past "->" + the sigil we
                        // just consumed) to test for the arrow, then restore.
                        let saved = self.position;
                        self.position -= 3;
                        let arrow = self.matches_bytes(b"->");
                        self.position = saved;
                        arrow
                    }
                    && matches!(self.current_char(), Some('{' | '[' | '*'))
                {
                    // Just return the sigil
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for $# (array length operator)
                if sigil == '$' && self.current_char() == Some('#') {
                    self.advance(); // consume #
                    // Now parse the array name
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else if ch == ':' && self.peek_char(1) == Some(':') {
                            // Package-qualified array name
                            self.advance();
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // $#foo is a complete variable token; a following `{` is a subscript.
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
                if self.current_char() == Some('{') {
                    // Peek ahead to decide if we should consume the brace
                    let next_char = self.peek_char(1);

                    // Check if this is a dereference like @{$ref} or @{[...]}
                    // If the next char suggests dereference, don't consume the brace.
                    // For @ and % sigils, identifiers inside braces are also derefs
                    // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                    let is_deref = sigil != '*'
                        && (matches!(
                            next_char,
                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                        ) || (matches!(sigil, '@' | '%')
                            && next_char.is_some_and(is_perl_identifier_start)));
                    if is_deref {
                        // This is a dereference, don't consume the brace
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // A standalone sigil token before `{` starts a dereference
                        // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
                        // Mark it as subscript-capable so `{` increments brace depth
                        // and the closing `}` can enable chained `{...}` subscripts.
                        // (Broader form than master's `$|@|%` filter — `*` is already
                        // excluded by the `is_deref` guard above and `&` deref also
                        // benefits from chained-subscript handling.)
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    self.advance(); // consume {

                    // Handle special variables with caret
                    if self.current_char() == Some('^') {
                        self.advance(); // consume ^
                        // Parse the special variable name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Handle stash access like $::{foo}
                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance(); // consume first :
                        self.advance(); // consume second :
                        // Skip optional { and }
                        if self.current_char() == Some('{') {
                            self.advance();
                        }
                        // Parse the name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                if self.current_char() == Some('}') {
                                    self.advance(); // consume closing } of ${...}
                                }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Regular braced variable like ${foo} or glob like *{$glob}
                    else {
                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                        // EXCEPT for globs - *{$glob} should be parsed as one token
                        // Also check for empty braces or EOF - in these cases we should split the tokens
                        if sigil != '*'
                            && (matches!(
                                self.current_char(),
                                Some(
                                    '$' | '@'
                                        | '%'
                                        | '*'
                                        | '&'
                                        | '['
                                        | ' '
                                        | '\t'
                                        | '\n'
                                        | '\r'
                                        | '}'
                                )
                            ) || self.current_char().is_none())
                        {
                            // This is a dereference or empty/invalid brace, backtrack
                            self.position = start + 1; // Just past the sigil
                            let text = &self.input[start..self.position];
                            self.mode = LexerMode::ExpectOperator;
                            // Same as above: sigil-only token means a dereference opener.
                            self.after_var_subscript = true;

                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                text: Arc::from(text),
                                start,
                                end: self.position,
                            });
                        }

                        // For glob access, we need to consume everything inside braces
                        if sigil == '*' {
                            let mut brace_depth: usize = 1;
                            while let Some(ch) = self.current_char() {
                                if ch == '{' {
                                    brace_depth += 1;
                                } else if ch == '}' {
                                    brace_depth = brace_depth.saturating_sub(1);
                                    if brace_depth == 0 {
                                        self.advance(); // consume final }
                                        break;
                                    }
                                }
                                self.advance();
                            }
                        } else {
                            // Regular variable
                            while let Some(ch) = self.current_char() {
                                if ch == '}' {
                                    self.advance(); // consume }
                                    break;
                                } else if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                }
                // Parse regular variable name
                else if let Some(ch) = self.current_char() {
                    if is_perl_identifier_start(ch) {
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                        // Handle package-qualified segments like Foo::bar
                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                            self.advance();
                            self.advance();
                            while let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                    // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                    // Not inside prototypes where ^ is a literal prototype char
                    else if sigil == '$' && ch == '^' && !self.in_prototype {
                        self.advance(); // consume ^
                        // $^Letter: consume the single uppercase letter
                        if let Some(letter) = self.current_char()
                            && letter.is_ascii_uppercase()
                        {
                            self.advance();
                        }
                        // bare $^ (no uppercase letter follows): format_top_name — stop here
                    }
                    // Handle special punctuation variables
                    // Not inside prototypes where ; and , are literal prototype chars
                    else if sigil == '$'
                        && !self.in_prototype
                        && matches!(
                            ch,
                            '?' | '!'
                                | '@'
                                | '&'
                                | '`'
                                | '\''
                                | '.'
                                | '/'
                                | '\\'
                                | '|'
                                | '+'
                                | '-'
                                | '['
                                | ']'
                                | '$'
                                | '~'
                                | '='
                                | '%'
                                | ','
                                | '"'
                                | ';'
                                | '>'
                                | '<'
                                | ')'
                                | '(' // $( = real group ID of this process
                        )
                    {
                        self.advance(); // consume the special character
                    }
                    // $$ is the PID special variable, but only when it is not immediately
                    // followed by an identifier-start character. $$var is scalar dereference
                    // of $var, so keep the second $ for the next token.
                    else if sigil == '$' && ch == '$' {
                        if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                            self.advance(); // consume the second $ for bare $$ PID
                        }
                    }
                    // Handle special array/hash punctuation variables
                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                        self.advance(); // consume the + or -
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // A complete $foo, @foo, %foo token can be followed by a hash/slice
                // subscript `{`. Set the flag so the `{` handler knows to increment
                // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
                // hash subscripts in the same way.
                self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

                Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                })
            }
            _ => None,
        }
    }
1616
1617 /// Return the next non-space char and the char immediately following it (without consuming).
1618 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1619 /// from `=` used as a plain delimiter.
1620 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1621 let mut i = self.position;
1622 while i < self.input.len() {
1623 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1624 Some(c) => c,
1625 None => return (None, None),
1626 };
1627 if c.is_whitespace() {
1628 i += c.len_utf8();
1629 continue;
1630 }
1631 // Found non-space at position i; peek the next char after it
1632 let j = i + c.len_utf8();
1633 let following = self.input.get(j..).and_then(|s| s.chars().next());
1634 return (Some(c), following);
1635 }
1636 (None, None)
1637 }
1638
1639 /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1640 fn is_quote_delim(c: char) -> bool {
1641 // Perl allows any non-alphanumeric, non-whitespace character as delimiter,
1642 // including control characters (e.g. s\x07pattern\x07replacement\x07).
1643 !c.is_ascii_alphanumeric() && !c.is_whitespace()
1644 }
1645
1646 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1647 ///
1648 /// A v-string starts with `v` followed by one or more digits, then optionally
1649 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1650 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1651 #[inline]
1652 fn try_vstring(&mut self) -> Option<Token> {
1653 let start = self.position;
1654 let bytes = self.input_bytes;
1655
1656 // Must start with 'v' followed by at least one digit
1657 if start >= bytes.len() || bytes[start] != b'v' {
1658 return None;
1659 }
1660
1661 let next_pos = start + 1;
1662 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1663 return None;
1664 }
1665
1666 // We have `v` followed by a digit — scan the rest of the v-string.
1667 // Pattern: v DIGITS (.DIGITS)*
1668 let mut pos = next_pos;
1669
1670 // Consume leading digits
1671 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1672 pos += 1;
1673 }
1674
1675 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1676 while pos < bytes.len() && bytes[pos] == b'.' {
1677 let dot_pos = pos;
1678 pos += 1; // skip '.'
1679
1680 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1681 // Dot not followed by digit — not part of the v-string
1682 pos = dot_pos;
1683 break;
1684 }
1685
1686 // Consume digits after the dot
1687 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1688 pos += 1;
1689 }
1690 }
1691
1692 // Make sure the v-string isn't followed by identifier-continuation characters
1693 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1694 if pos < bytes.len() {
1695 let next_byte = bytes[pos];
1696 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1697 return None;
1698 }
1699 // Also check for non-ASCII identifier continuations
1700 if next_byte >= 128
1701 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1702 && is_perl_identifier_continue(ch)
1703 {
1704 return None;
1705 }
1706 }
1707
1708 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1709 let text = &self.input[start..pos];
1710
1711 self.position = pos;
1712 self.mode = LexerMode::ExpectOperator;
1713
1714 Some(Token {
1715 token_type: TokenType::Version(Arc::from(text)),
1716 text: Arc::from(text),
1717 start,
1718 end: self.position,
1719 })
1720 }
1721
    /// Lex a bareword: keyword, identifier, or quote-like operator.
    ///
    /// Responsibilities, in order:
    /// 1. Detect `s'`, `y'`, `tr'` (apostrophe-delimited substitution /
    ///    transliteration) before the apostrophe would be swallowed as part
    ///    of an identifier.
    /// 2. Consume the identifier proper, including `'` inside names and
    ///    `Foo::Bar` package qualification (fast ASCII path with a Unicode
    ///    fallback via `is_perl_identifier_continue`).
    /// 3. Recognize `__DATA__` / `__END__` markers — only at line start, in
    ///    the code channel, with nothing but whitespace to end of line.
    /// 4. Detect `s`/`tr`/`y` and other quote operators with arbitrary
    ///    delimiters, applying fat-arrow (`s => 1`) and whitespace rules so
    ///    barewords are not misread as operators.
    /// 5. Classify keyword vs. identifier, updating the term/operator mode
    ///    so `/` disambiguation (regex vs. division) works downstream.
    ///
    /// Returns `None` when the current char cannot start an identifier.
    #[inline]
    fn try_identifier_or_keyword(&mut self) -> Option<Token> {
        let start = self.position;
        let ch = self.current_char()?;
        let bytes = self.input_bytes;
        let len = bytes.len();

        if is_perl_identifier_start(ch) {
            // Special case: substitution/transliteration with single-quote delimiter
            // The single quote is considered an identifier continuation, so we need to
            // detect these operators before consuming it as part of an identifier.
            if !self.after_arrow
                && self.hash_brace_depth == 0
                && ch == 's'
                && self.peek_char(1) == Some('\'')
            {
                self.advance(); // consume 's'
                return self.parse_substitution(start);
            } else if !self.after_arrow
                && self.hash_brace_depth == 0
                && ch == 'y'
                && self.peek_char(1) == Some('\'')
            {
                self.advance(); // consume 'y'
                return self.parse_transliteration(start);
            } else if !self.after_arrow
                && self.hash_brace_depth == 0
                && ch == 't'
                && self.peek_char(1) == Some('r')
                && self.peek_char(2) == Some('\'')
            {
                self.advance(); // consume 't'
                self.advance(); // consume 'r'
                return self.parse_transliteration(start);
            }

            // Fast ASCII path for identifier continuation.
            while self.position < len {
                let byte = bytes[self.position];
                if byte == b'\'' && is_quote_op_word_prefix(&bytes[start..self.position]) {
                    // Keep apostrophe for quote-operator parsing in cases like q'...'.
                    break;
                }

                if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
                    self.position += 1;
                    continue;
                }

                // Any other ASCII byte ends the identifier.
                if byte < 128 {
                    break;
                }

                // Non-ASCII: fall back to the char-based Unicode check.
                if let Some(ch) = self.current_char()
                    && is_perl_identifier_continue(ch)
                {
                    self.advance();
                    continue;
                }
                break;
            }
            // Handle package-qualified identifiers like Foo::bar.
            while self.config.max_lookahead >= 1
                && self.position + 1 < len
                && bytes[self.position] == b':'
                && bytes[self.position + 1] == b':'
            {
                self.position += 2; // consume '::'

                // consume following identifier segment if present
                let Some(ch) = self.current_char() else {
                    break;
                };
                if !is_perl_identifier_start(ch) {
                    break;
                }
                self.advance();
                while self.position < len {
                    let byte = bytes[self.position];
                    if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
                        self.position += 1;
                        continue;
                    }
                    if byte < 128 {
                        break;
                    }
                    if let Some(ch) = self.current_char()
                        && is_perl_identifier_continue(ch)
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }

            let text = &self.input[start..self.position];

            // Check for __DATA__ and __END__ markers using exact match
            // Only recognize these in code channel, not inside data/format sections or heredocs
            let in_code_channel =
                !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
                    && self.pending_heredocs.is_empty();

            let marker = if in_code_channel {
                if text == "__DATA__" {
                    Some("__DATA__")
                } else if text == "__END__" {
                    Some("__END__")
                } else {
                    None
                }
            } else {
                None
            };

            if let Some(marker_text) = marker {
                // These must be at the beginning of a line
                // Use the after_newline flag to determine if we're at line start
                if self.after_newline {
                    // Check if rest of line is only whitespace
                    // Only treat as data marker if line has no trailing junk
                    if Self::trailing_ws_only(self.input_bytes, self.position) {
                        // Consume the rest of the line (the marker line)
                        while self.position < self.input.len()
                            && self.input_bytes[self.position] != b'\n'
                            && self.input_bytes[self.position] != b'\r'
                        {
                            self.advance();
                        }
                        self.consume_newline();

                        // Switch to data section mode
                        self.mode = LexerMode::InDataSection;

                        return Some(Token {
                            token_type: TokenType::DataMarker(Arc::from(marker_text)),
                            text: Arc::from(marker_text),
                            start,
                            end: self.position,
                        });
                    }
                }
            }

            // Check for substitution/transliteration operators
            // Skip if after '->' -- these are method names, not operators.
            #[allow(clippy::collapsible_if)]
            if !self.after_arrow && self.hash_brace_depth == 0 && matches!(text, "s" | "tr" | "y") {
                let immediate = self.current_char();
                let (candidate, char_after_next, has_whitespace) =
                    if immediate.is_some_and(|c| c.is_whitespace()) {
                        let (nc, ca) = self.peek_nonspace_and_following();
                        (nc, ca, true)
                    } else {
                        let following = immediate.and_then(|c| {
                            let j = self.position + c.len_utf8();
                            self.input.get(j..).and_then(|s| s.chars().next())
                        });
                        (immediate, following, false)
                    };

                if let Some(next) = candidate {
                    // `s => 1` should remain a fat-arrow hash key, not quote op.
                    let is_fat_arrow = next == '=' && char_after_next == Some('>');
                    let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                    let is_quote_char = matches!(next, '\'' | '"') && text != "s";
                    let transliteration_allows_whitespace = text == "tr" || text == "y";
                    let substitution_disallows_whitespace = text == "s" && has_whitespace;
                    let is_valid_delim = Self::is_quote_delim(next)
                        && !is_fat_arrow
                        && !substitution_disallows_whitespace
                        && (!has_whitespace
                            || is_paired_delim
                            || is_quote_char
                            || transliteration_allows_whitespace);

                    if is_valid_delim {
                        match text {
                            "s" => return self.parse_substitution(start),
                            "tr" | "y" => return self.parse_transliteration(start),
                            // Defensive: unreachable given the matches! guard
                            // above, but kept so a future edit can't silently
                            // fall through.
                            unexpected => {
                                return Some(Token {
                                    token_type: TokenType::Error(Arc::from(format!(
                                        "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
                                        unexpected, start
                                    ))),
                                    text: Arc::from(unexpected),
                                    start,
                                    end: self.position,
                                });
                            }
                        }
                    }
                }
            }

            let token_type = if is_keyword_fast(text) {
                // Check for special keywords that affect lexer mode
                match text {
                    "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
                    | "sort" | "split" | "and" | "or" | "xor" | "not"
                    // These keywords introduce an expression, so a following `/` is a
                    // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
                    // `do /file/`, and `eval /re/` are all valid Perl.
                    | "return" | "die" | "warn" | "do" | "eval" => {
                        self.mode = LexerMode::ExpectTerm;
                    }
                    "sub" => {
                        self.after_sub = true;
                        self.mode = LexerMode::ExpectTerm;
                    }
                    // Quote operators expect a delimiter next.
                    // Skip if after '->' -- these are method names, not operators.
                    // Skip inside hash subscript braces (hash_brace_depth > 0) — all
                    // positions inside `$h{...}` or `@h{...}` treat quote-op names as
                    // bareword keys, including after commas in slices like `@h{m, s}`.
                    op if !self.after_arrow
                        && self.hash_brace_depth == 0
                        && quote_handler::is_quote_operator(op) =>
                    {
                        // Perl allows whitespace between a quote-like operator and its delimiter,
                        // but ONLY for paired delimiters (s { ... } { ... }g).
                        // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
                        // must be immediately adjacent — otherwise `s $foo` would wrongly
                        // treat `$` as a delimiter instead of being a bareword `s` followed
                        // by a scalar variable.
                        //
                        // Strategy:
                        // 1. Check the immediately-adjacent char first (no whitespace skip).
                        //    If it is a valid delimiter → any non-alnum, non-whitespace char.
                        // 2. If the adjacent char is whitespace, peek past it.
                        //    Only accept PAIRED delimiters ({, [, (, <) in that case.
                        let immediate = self.current_char();
                        let (candidate, char_after_next, has_whitespace) =
                            if immediate.is_some_and(|c| c.is_whitespace()) {
                                // There is whitespace — peek past it
                                let (nc, ca) = self.peek_nonspace_and_following();
                                (nc, ca, true)
                            } else {
                                // No whitespace — use immediate char
                                let following = immediate.and_then(|c| {
                                    let j = self.position + c.len_utf8();
                                    self.input.get(j..).and_then(|s| s.chars().next())
                                });
                                (immediate, following, false)
                            };

                        if let Some(next) = candidate {
                            // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
                            // not a valid substitution delimiter. Treat as identifier.
                            let is_fat_arrow = next == '=' && char_after_next == Some('>');

                            // When whitespace precedes the delimiter, only unambiguous
                            // delimiters are accepted:
                            // - Paired delimiters ({, [, (, <) are always safe.
                            // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
                            //   is a valid file-size filetest and must not be treated as a
                            //   substitution start. All other operators (qw, q, qq, qr, qx, m,
                            //   tr, y) have no corresponding file-test operator.
                            // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
                            let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                            let is_quote_char = matches!(next, '\'' | '"') && op != "s";
                            let is_valid_delim = Self::is_quote_delim(next)
                                && !is_fat_arrow
                                && (!has_whitespace || is_paired_delim || is_quote_char);

                            if is_valid_delim {
                                self.mode = LexerMode::ExpectDelimiter;
                                self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
                                    operator: op.to_string(),
                                    delimiter: '\0', // Will be set when we see the delimiter
                                    start_pos: start,
                                });

                                // Don't return a keyword token - continue to parse the delimiter
                                // Skip any whitespace between operator and delimiter
                                while let Some(ch) = self.current_char() {
                                    if ch.is_whitespace() {
                                        self.advance();
                                    } else {
                                        break;
                                    }
                                }

                                // Get the delimiter
                                #[allow(clippy::collapsible_if)]
                                if let Some(delim) = self.current_char() {
                                    if !delim.is_alphanumeric() {
                                        self.advance();
                                        if let Some(ref mut info) = self.current_quote_op {
                                            info.delimiter = delim;
                                        }
                                        // Parse the quote operator content and return the complete token
                                        return self.parse_quote_operator(delim);
                                    }
                                }
                            } else {
                                // Not a quote operator here → treat as IDENTIFIER
                                self.current_quote_op = None;
                                self.mode = LexerMode::ExpectOperator;
                                return Some(Token {
                                    token_type: TokenType::Identifier(Arc::from(text)),
                                    start,
                                    end: self.position,
                                    text: Arc::from(text),
                                });
                            }
                        } else {
                            // End-of-input after the word → also treat as IDENTIFIER
                            self.current_quote_op = None;
                            self.mode = LexerMode::ExpectOperator;
                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                start,
                                end: self.position,
                                text: Arc::from(text),
                            });
                        }
                        // If we get here but haven't returned, something went wrong
                        // Fall through to treat as identifier
                        self.current_quote_op = None;
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            start,
                            end: self.position,
                            text: Arc::from(text),
                        });
                    }
                    // Format declarations need special handling
                    "format" => {
                        // We'll need to check for the = after the format name
                        // For now, just mark that we saw format
                    }
                    _ if is_builtin_function(text) => {
                        // Bare builtins are term-introducing in Perl.
                        self.mode = LexerMode::ExpectTerm;
                    }
                    _ => {
                        self.mode = LexerMode::ExpectOperator;
                    }
                }
                TokenType::Keyword(Arc::from(text))
            } else {
                // Mirror parser bare-builtin handling so `/` after builtins like
                // `join` or `print` is lexed as a regex term, not division.
                if is_builtin_function(text) {
                    self.mode = LexerMode::ExpectTerm;
                } else {
                    self.mode = LexerMode::ExpectOperator;
                }
                TokenType::Identifier(Arc::from(text))
            };

            self.after_arrow = false;
            // A keyword/identifier is not a variable; `{` after it is a block opener.
            self.after_var_subscript = false;
            // hash_brace_depth is managed by { and } handlers, not cleared per-token
            Some(Token { token_type, text: Arc::from(text), start, end: self.position })
        } else {
            None
        }
    }
2086
2087 /// Parse data section body - consumes everything to EOF
2088 fn parse_data_body(&mut self) -> Option<Token> {
2089 if self.position >= self.input.len() {
2090 // Already at EOF
2091 self.mode = LexerMode::ExpectTerm;
2092 return Some(Token {
2093 token_type: TokenType::EOF,
2094 text: Arc::from(""),
2095 start: self.position,
2096 end: self.position,
2097 });
2098 }
2099
2100 let start = self.position;
2101 // Consume everything to EOF
2102 let body = &self.input[self.position..];
2103 self.position = self.input.len();
2104
2105 // Reset mode for next parse (though we're at EOF)
2106 self.mode = LexerMode::ExpectTerm;
2107
2108 Some(Token {
2109 token_type: TokenType::DataBody(Arc::from(body)),
2110 text: Arc::from(body),
2111 start,
2112 end: self.position,
2113 })
2114 }
2115
2116 /// Parse format body - consumes until a line with just a dot
2117 fn parse_format_body(&mut self) -> Option<Token> {
2118 let start = self.position;
2119 let mut body = String::new();
2120 let mut line_start = true;
2121
2122 while self.position < self.input.len() {
2123 // Check if we're at the start of a line and the next char is a dot
2124 if line_start && self.current_char() == Some('.') {
2125 // Check if this line contains only a dot
2126 let mut peek_pos = self.position + 1;
2127 let mut found_terminator = true;
2128
2129 // Skip any trailing whitespace on the dot line
2130 while peek_pos < self.input.len() {
2131 match self.input_bytes[peek_pos] {
2132 b' ' | b'\t' | b'\r' => peek_pos += 1,
2133 b'\n' => break,
2134 _ => {
2135 found_terminator = false;
2136 break;
2137 }
2138 }
2139 }
2140
2141 if found_terminator {
2142 // We found the terminating dot, consume it
2143 self.position = peek_pos;
2144 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2145 {
2146 self.position += 1;
2147 }
2148
2149 // Switch back to normal mode
2150 self.mode = LexerMode::ExpectTerm;
2151
2152 return Some(Token {
2153 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2154 text: Arc::from(body),
2155 start,
2156 end: self.position,
2157 });
2158 }
2159 }
2160
2161 // Not a terminator, consume the character
2162 match self.current_char() {
2163 Some(ch) => {
2164 body.push(ch);
2165 self.advance();
2166
2167 // Track if we're at the start of a line
2168 line_start = ch == '\n';
2169 }
2170 None => {
2171 // Reached EOF without finding terminator
2172 break;
2173 }
2174 }
2175 }
2176
2177 // If we reach here, we didn't find a terminator
2178 self.mode = LexerMode::ExpectTerm;
2179 Some(Token {
2180 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2181 text: Arc::from(body),
2182 start,
2183 end: self.position,
2184 })
2185 }
2186
2187 fn try_operator(&mut self) -> Option<Token> {
2188 // Skip operator parsing if we're expecting a delimiter for a quote operator
2189 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2190 return None;
2191 }
2192
2193 let start = self.position;
2194 let ch = self.current_char()?;
2195
2196 // ═══════════════════════════════════════════════════════════════════════
2197 // SLASH DISAMBIGUATION STRATEGY (Issue #422)
2198 // ═══════════════════════════════════════════════════════════════════════
2199 //
2200 // Perl's `/` character is ambiguous:
2201 // - Division operator: `$x / 2`
2202 // - Regex delimiter: `/pattern/`
2203 // - Defined-or operator: `$x // $y`
2204 //
2205 // **Disambiguation Strategy (Context-Aware Heuristics):**
2206 //
2207 // 1. **Mode-Based Decision (Primary)**:
2208 // - `LexerMode::ExpectTerm` → `/` starts a regex
2209 // Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
2210 // - `LexerMode::ExpectOperator` → `/` is division or `//`
2211 // Examples: `$x / 2`, `$x // $y`, `) / 3`
2212 //
2213 // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
2214 // Mode is set based on previous token:
2215 // - After identifier/number/closing paren → ExpectOperator → division
2216 // - After operator/keyword/opening paren → ExpectTerm → regex
2217 //
2218 // 3. **Budget Protection**:
2219 // - Regex parsing has a parse-step budget and byte budget
2220 // - Budget exceeded → emit UnknownRest token (graceful degradation)
2221 // - See `parse_regex()` and `budget_guard()` for implementation
2222 //
2223 // 4. **Performance Characteristics**:
2224 // - Single-pass: O(1) decision based on mode flag
2225 // - No backtracking: Mode updated after each token
2226 // - Optimized: Byte-level operations for common cases
2227 //
2228 // **Metrics & Monitoring**:
2229 // - Budget exceeded events tracked via UnknownRest token emission
2230 // - LSP diagnostics generated for truncated regexes
2231 // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
2232 //
2233 // ═══════════════════════════════════════════════════════════════════════
2234
2235 if ch == '/' {
2236 if self.mode == LexerMode::ExpectTerm {
2237 // Mode indicates we're expecting a term → `/` starts a regex
2238 // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
2239 return self.parse_regex(start);
2240 } else {
2241 // Mode indicates we're expecting an operator → `/` is division or `//`
2242 // Examples: `$x / 2`, `$x // $y`, `10 / 3`
2243 self.advance();
2244 // Check for // or //= using byte-level operations for speed
2245 if self.peek_byte(0) == Some(b'/') {
2246 self.position += 1; // consume second / directly
2247 if self.peek_byte(0) == Some(b'=') {
2248 self.position += 1; // consume = directly
2249 let text = &self.input[start..self.position];
2250 self.mode = LexerMode::ExpectTerm;
2251 return Some(Token {
2252 token_type: TokenType::Operator(Arc::from(text)),
2253 text: Arc::from(text),
2254 start,
2255 end: self.position,
2256 });
2257 } else {
2258 // Use cached string for common "//" operator
2259 self.mode = LexerMode::ExpectTerm;
2260 return Some(Token {
2261 token_type: TokenType::Operator(Arc::from("//")),
2262 text: Arc::from("//"),
2263 start,
2264 end: self.position,
2265 });
2266 }
2267 } else if self.position < self.input_bytes.len()
2268 && self.input_bytes[self.position] == b'='
2269 {
2270 // /= division-assign operator
2271 self.position += 1; // consume =
2272 self.mode = LexerMode::ExpectTerm;
2273 return Some(Token {
2274 token_type: TokenType::Operator(Arc::from("/=")),
2275 text: Arc::from("/="),
2276 start,
2277 end: self.position,
2278 });
2279 } else {
2280 // Use cached string for common "/" division
2281 self.mode = LexerMode::ExpectTerm;
2282 return Some(Token {
2283 token_type: TokenType::Division,
2284 text: Arc::from("/"),
2285 start,
2286 end: self.position,
2287 });
2288 }
2289 }
2290 }
2291
2292 // Handle other operators - simplified
2293 match ch {
2294 '.' => {
2295 // Check if it's a decimal number like .5 -- but only when we
2296 // expect a term. In operator position `.5` is concatenation
2297 // of the bareword/number on the left with the number `5`.
2298 if self.mode != LexerMode::ExpectOperator
2299 && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
2300 {
2301 return self.parse_decimal_number(start);
2302 }
2303 self.advance();
2304 // Check for compound operators
2305 #[allow(clippy::collapsible_if)]
2306 if let Some(next) = self.current_char() {
2307 if is_compound_operator(ch, next) {
2308 self.advance();
2309
2310 // Check for three-character operators like **=, <<=, >>=
2311 if self.position < self.input.len() {
2312 let third = self.current_char();
2313 // Check for three-character operators
2314 if matches!(
2315 (ch, next, third),
2316 ('*', '*', Some('='))
2317 | ('<', '<', Some('='))
2318 | ('>', '>', Some('='))
2319 | ('&', '&', Some('='))
2320 | ('|', '|', Some('='))
2321 | ('/', '/', Some('='))
2322 ) {
2323 self.advance(); // consume the =
2324 } else if ch == '<' && next == '=' && third == Some('>') {
2325 self.advance(); // consume the >
2326 // Special case: <=> spaceship operator
2327 } else if ch == '.' && next == '.' && third == Some('.') {
2328 self.advance(); // consume the third .
2329 }
2330 }
2331 }
2332 }
2333 }
2334 '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
2335 | '\\' => {
2336 self.advance();
2337 // Check for compound operators
2338 #[allow(clippy::collapsible_if)]
2339 if let Some(next) = self.current_char() {
2340 if is_compound_operator(ch, next) {
2341 self.advance();
2342
2343 // Check for three-character operators like **=, <<=, >>=
2344 if self.position < self.input.len() {
2345 let third = self.current_char();
2346 // Check for three-character operators
2347 if matches!(
2348 (ch, next, third),
2349 ('*', '*', Some('='))
2350 | ('<', '<', Some('='))
2351 | ('>', '>', Some('='))
2352 | ('&', '&', Some('='))
2353 | ('|', '|', Some('='))
2354 | ('/', '/', Some('='))
2355 ) {
2356 self.advance(); // consume the =
2357 } else if ch == '<' && next == '=' && third == Some('>') {
2358 self.advance(); // consume the >
2359 // Special case: <=> spaceship operator
2360 }
2361 }
2362 }
2363 }
2364 }
2365 _ => return None,
2366 }
2367
2368 let text = &self.input[start..self.position];
2369 // Operator ends prototype window (e.g. `:` for attributes)
2370 self.after_sub = false;
2371 // Track whether this operator is '->' for method name disambiguation
2372 self.after_arrow = text == "->";
2373 // Any operator token ends the "just saw a variable" window; `{` after
2374 // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
2375 self.after_var_subscript = false;
2376 // Postfix ++ and -- complete a term expression, so next token is an operator
2377 // (e.g., "$x++ / 2" → / is division, not regex)
2378 if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
2379 // Postfix: stay in ExpectOperator
2380 } else {
2381 self.mode = LexerMode::ExpectTerm;
2382 }
2383
2384 Some(Token {
2385 token_type: TokenType::Operator(Arc::from(text)),
2386 text: Arc::from(text),
2387 start,
2388 end: self.position,
2389 })
2390 }
2391
2392 fn try_delimiter(&mut self) -> Option<Token> {
2393 let start = self.position;
2394 let ch = self.current_char()?;
2395
2396 // If we're expecting a delimiter for a quote operator, handle it specially
2397 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2398 // Accept any non-alphanumeric character as a delimiter
2399 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2400 self.advance();
2401 if let Some(ref mut info) = self.current_quote_op {
2402 info.delimiter = ch;
2403 }
2404 // Now parse the quote operator content
2405 return self.parse_quote_operator(ch);
2406 }
2407 }
2408
2409 match ch {
2410 '(' => {
2411 // Check if this is a quote operator delimiter
2412 if matches!(self.mode, LexerMode::ExpectDelimiter)
2413 && self.current_quote_op.is_some()
2414 {
2415 self.advance();
2416 if let Some(ref mut info) = self.current_quote_op {
2417 info.delimiter = ch;
2418 }
2419 return self.parse_quote_operator(ch);
2420 }
2421
2422 self.advance();
2423 if self.after_sub {
2424 // Promote after_sub to in_prototype now that we see '('
2425 self.in_prototype = true;
2426 self.after_sub = false;
2427 self.prototype_depth = 1;
2428 } else if self.in_prototype {
2429 self.prototype_depth += 1;
2430 }
2431 self.paren_depth += 1;
2432 self.after_var_subscript = false;
2433 self.mode = LexerMode::ExpectTerm;
2434 Some(Token {
2435 token_type: TokenType::LeftParen,
2436 text: Arc::from("("),
2437 start,
2438 end: self.position,
2439 })
2440 }
2441 ')' => {
2442 self.advance();
2443 if self.in_prototype && self.prototype_depth > 0 {
2444 self.prototype_depth -= 1;
2445 if self.prototype_depth == 0 {
2446 self.in_prototype = false;
2447 }
2448 }
2449 self.after_arrow = false;
2450 self.paren_depth = self.paren_depth.saturating_sub(1);
2451 // A closing paren ends any var-subscript context: `if ($var)` should
2452 // NOT leave after_var_subscript set, otherwise the following `{` would
2453 // incorrectly increment hash_brace_depth and suppress regex operators
2454 // inside the block body (issue #2844).
2455 self.after_var_subscript = false;
2456 self.mode = LexerMode::ExpectOperator;
2457 Some(Token {
2458 token_type: TokenType::RightParen,
2459 text: Arc::from(")"),
2460 start,
2461 end: self.position,
2462 })
2463 }
2464 ';' => {
2465 self.advance();
2466 // Semicolon ends prototype window (forward declaration)
2467 self.after_sub = false;
2468 // Semicolon is a statement boundary — any pending method-call chain is over.
2469 self.after_arrow = false;
2470 self.after_var_subscript = false;
2471 self.mode = LexerMode::ExpectTerm;
2472 Some(Token {
2473 token_type: TokenType::Semicolon,
2474 text: Arc::from(";"),
2475 start,
2476 end: self.position,
2477 })
2478 }
2479 ',' => {
2480 self.advance();
2481 self.after_var_subscript = false;
2482 self.mode = LexerMode::ExpectTerm;
2483 Some(Token {
2484 token_type: TokenType::Comma,
2485 text: Arc::from(","),
2486 start,
2487 end: self.position,
2488 })
2489 }
2490 '[' => {
2491 self.advance();
2492 self.after_var_subscript = false;
2493 self.mode = LexerMode::ExpectTerm;
2494 Some(Token {
2495 token_type: TokenType::LeftBracket,
2496 text: Arc::from("["),
2497 start,
2498 end: self.position,
2499 })
2500 }
2501 ']' => {
2502 self.advance();
2503 // A closing `]` from an array subscript leaves us in a state where
2504 // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
2505 // Set after_var_subscript so the `{` handler recognises it as such.
2506 // This mirrors the `}` handler's behavior when closing a hash subscript.
2507 self.after_var_subscript = true;
2508 self.mode = LexerMode::ExpectOperator;
2509 Some(Token {
2510 token_type: TokenType::RightBracket,
2511 text: Arc::from("]"),
2512 start,
2513 end: self.position,
2514 })
2515 }
2516 '{' => {
2517 self.advance();
2518 // Opening brace ends prototype window — no prototype follows
2519 self.after_sub = false;
2520 // `{` is a hash/slice subscript opener only when it immediately follows
2521 // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
2522 // This is narrower than the old `mode == ExpectOperator` check, which
2523 // incorrectly incremented depth for block-opening braces after `sub foo`,
2524 // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
2525 // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
2526 if self.after_var_subscript {
2527 self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
2528 }
2529 self.after_var_subscript = false;
2530 self.mode = LexerMode::ExpectTerm;
2531 Some(Token {
2532 token_type: TokenType::LeftBrace,
2533 text: Arc::from("{"),
2534 start,
2535 end: self.position,
2536 })
2537 }
2538 '}' => {
2539 self.advance();
2540 self.after_arrow = false;
2541 // Decrement hash subscript brace depth only if we were inside one.
2542 // If depth > 0, this closes a hash subscript; enable chained subscripts
2543 // like $h{a}{b} by setting after_var_subscript so the next `{` is
2544 // recognized as another subscript opener.
2545 if self.hash_brace_depth > 0 {
2546 self.hash_brace_depth -= 1;
2547 // The subscript value is now the "variable" for a chained subscript.
2548 self.after_var_subscript = true;
2549 } else {
2550 // Block-close `}` — no subscript follows
2551 self.after_var_subscript = false;
2552 }
2553 self.mode = LexerMode::ExpectOperator;
2554 Some(Token {
2555 token_type: TokenType::RightBrace,
2556 text: Arc::from("}"),
2557 start,
2558 end: self.position,
2559 })
2560 }
2561 '#' => {
2562 // Only treat as delimiter in ExpectDelimiter mode
2563 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2564 self.advance();
2565 // Reset mode after consuming delimiter
2566 self.mode = LexerMode::ExpectTerm;
2567 Some(Token {
2568 token_type: TokenType::Operator(Arc::from("#")),
2569 text: Arc::from("#"),
2570 start,
2571 end: self.position,
2572 })
2573 } else {
2574 None
2575 }
2576 }
2577 _ => None,
2578 }
2579 }
2580
    /// Lex a double-quoted string starting at `start` (the opening `"`).
    ///
    /// When interpolation is enabled (`config.parse_interpolation`), `$`-forms
    /// are split into `StringPart`s: `${...}` expressions, `$name` variables,
    /// and `$name->...` method-call / `$name[...]` / `$name{...}` tails.
    /// Returns `StringLiteral` when no parts were collected, otherwise
    /// `InterpolatedString(parts)`; an unterminated string yields an `Error`
    /// token via `unterminated_string_error`.
    ///
    /// NOTE(review): a plain string like `"abc"` ends with one `Literal` part
    /// and is therefore emitted as `InterpolatedString`, not `StringLiteral`
    /// (which only occurs for `""`) — confirm downstream consumers expect this.
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                '"' => {
                    // Closing quote: flush any pending literal and emit.
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Keep escape sequences verbatim (backslash + next char);
                    // decoding is left to later stages.
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    let part_start = self.position;
                    self.advance();
                    match self.current_char() {
                        Some('{') => {
                            // ${ expr } — capture the whole balanced segment.
                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                            parts.push(StringPart::Expression(Arc::from(
                                &self.input[part_start..self.position],
                            )));
                        }
                        Some(ch) if is_perl_identifier_start(ch) => {
                            let var_start = self.position;

                            // Fast path for ASCII identifier continuation
                            while self.position < self.input_bytes.len() {
                                let byte = self.input_bytes[self.position];
                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                    self.position += 1;
                                } else if byte >= 128 {
                                    // Only use UTF-8 parsing for non-ASCII
                                    if let Some(ch) = self.current_char() {
                                        if is_perl_identifier_continue(ch) {
                                            self.advance();
                                        } else {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                } else {
                                    break;
                                }
                            }

                            if self.position > var_start {
                                // The part text includes the leading `$`.
                                let var_name = &self.input[part_start..self.position];
                                parts.push(StringPart::Variable(Arc::from(var_name)));

                                if self.matches_bytes(b"->") {
                                    // `$var->...` tail: subscript, method call,
                                    // or bare arrow — captured as MethodCall.
                                    let tail_start = self.position;
                                    self.advance();
                                    self.advance();

                                    match self.current_char() {
                                        Some('[') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('[', ']', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('{') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('{', '}', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('(') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('(', ')', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some(ch) if is_perl_identifier_start(ch) => {
                                            // Method name, optionally followed by
                                            // an argument list in parens.
                                            while self.position < self.input_bytes.len() {
                                                let byte = self.input_bytes[self.position];
                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                                    self.position += 1;
                                                } else if byte >= 128 {
                                                    if let Some(ch) = self.current_char() {
                                                        if is_perl_identifier_continue(ch) {
                                                            self.advance();
                                                        } else {
                                                            break;
                                                        }
                                                    } else {
                                                        break;
                                                    }
                                                } else {
                                                    break;
                                                }
                                            }
                                            if self.current_char() == Some('(') {
                                                let _ = self.consume_balanced_segment_in_string(
                                                    '(', ')', '"',
                                                );
                                            }
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        _ => {
                                            // Bare `->` with nothing usable after:
                                            // still recorded so source round-trips.
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                    }
                                } else if self.current_char() == Some('[') {
                                    // `$var[...]` array element/slice.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
                                    parts.push(StringPart::ArraySlice(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                } else if self.current_char() == Some('{') {
                                    // `$var{...}` hash element.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                                    parts.push(StringPart::Expression(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                }
                            }
                        }
                        // NOTE(review): a `$` not followed by `{` or an identifier
                        // start is consumed but pushed to no part — the token's
                        // `text` keeps the full source, but the parts list drops
                        // the literal `$`. Confirm whether parts are expected to
                        // reconstruct the string.
                        _ => {}
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        // EOF before closing quote.
        Some(self.unterminated_string_error(start))
    }
2763
2764 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2765 self.advance(); // Skip opening quote
2766
2767 let mut last_pos = self.position;
2768
2769 while let Some(ch) = self.current_char() {
2770 match ch {
2771 '\'' => {
2772 self.advance();
2773 let text = &self.input[start..self.position];
2774 self.mode = LexerMode::ExpectOperator;
2775
2776 return Some(Token {
2777 token_type: TokenType::StringLiteral,
2778 text: Arc::from(text),
2779 start,
2780 end: self.position,
2781 });
2782 }
2783 '\\' => {
2784 self.advance();
2785 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2786 self.advance();
2787 }
2788 }
2789 _ => self.advance(),
2790 }
2791
2792 // Safety check: ensure we're making progress
2793 if self.position == last_pos {
2794 break;
2795 }
2796 last_pos = self.position;
2797 }
2798
2799 Some(self.unterminated_string_error(start))
2800 }
2801
2802 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2803 self.advance(); // Skip opening backtick
2804
2805 let mut last_pos = self.position;
2806
2807 while let Some(ch) = self.current_char() {
2808 match ch {
2809 '`' => {
2810 self.advance();
2811 let text = &self.input[start..self.position];
2812 self.mode = LexerMode::ExpectOperator;
2813
2814 return Some(Token {
2815 token_type: TokenType::QuoteCommand,
2816 text: Arc::from(text),
2817 start,
2818 end: self.position,
2819 });
2820 }
2821 '\\' => {
2822 self.advance();
2823 if self.current_char().is_some() {
2824 self.advance();
2825 }
2826 }
2827 _ => self.advance(),
2828 }
2829
2830 // Safety check: ensure we're making progress
2831 if self.position == last_pos {
2832 break;
2833 }
2834 last_pos = self.position;
2835 }
2836
2837 Some(self.unterminated_string_error(start))
2838 }
2839
    /// Placeholder for dedicated `q`-string parsing.
    ///
    /// NOTE(review): always returns `None`; q/qq/qw/qx lexing appears to go
    /// through the quote-operator path (`parse_quote_operator`) instead —
    /// confirm this stub is intentionally inert before removing it.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
2844
2845 #[inline]
2846 fn unterminated_string_error(&mut self, start: usize) -> Token {
2847 // Consume to EOF so the caller receives a single terminal error token.
2848 let end = self.input.len();
2849 self.position = end;
2850
2851 Token {
2852 token_type: TokenType::Error(Arc::from("unterminated string")),
2853 text: Arc::from(&self.input[start..end]),
2854 start,
2855 end,
2856 }
2857 }
2858
2859 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
2860 // We've already consumed 's'
2861 let delimiter = self.current_char()?;
2862 self.advance(); // Skip delimiter
2863 self.parse_substitution_with_delimiter(start, delimiter)
2864 }
2865
2866 fn parse_substitution_with_delimiter(
2867 &mut self,
2868 start: usize,
2869 delimiter: char,
2870 ) -> Option<Token> {
2871 self.read_delimited_body(delimiter);
2872
2873 let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
2874 if pattern_is_paired {
2875 while self.current_char().is_some_and(char::is_whitespace) {
2876 self.advance();
2877 }
2878
2879 if let Some(repl_delim) = self.current_char()
2880 && Self::is_quote_delim(repl_delim)
2881 {
2882 self.advance();
2883 self.read_delimited_body(repl_delim);
2884 }
2885 } else {
2886 self.read_delimited_body(delimiter);
2887 }
2888
2889 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2890 while let Some(ch) = self.current_char() {
2891 if ch.is_ascii_alphanumeric() {
2892 self.advance();
2893 } else {
2894 break;
2895 }
2896 }
2897
2898 let text = &self.input[start..self.position];
2899 self.mode = LexerMode::ExpectOperator;
2900
2901 Some(Token {
2902 token_type: TokenType::Substitution,
2903 text: Arc::from(text),
2904 start,
2905 end: self.position,
2906 })
2907 }
2908
2909 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
2910 // We've already consumed 'tr' or 'y'
2911 while self.current_char().is_some_and(char::is_whitespace) {
2912 self.advance();
2913 }
2914
2915 let delimiter = self.current_char()?;
2916 self.advance(); // Skip delimiter
2917 self.parse_transliteration_with_delimiter(start, delimiter)
2918 }
2919
2920 fn parse_transliteration_with_delimiter(
2921 &mut self,
2922 start: usize,
2923 delimiter: char,
2924 ) -> Option<Token> {
2925 self.read_delimited_body(delimiter);
2926
2927 let search_is_paired = quote_handler::paired_close(delimiter).is_some();
2928 if search_is_paired {
2929 while self.current_char().is_some_and(char::is_whitespace) {
2930 self.advance();
2931 }
2932
2933 if let Some(repl_delim) = self.current_char()
2934 && Self::is_quote_delim(repl_delim)
2935 {
2936 self.advance();
2937 self.read_delimited_body(repl_delim);
2938 }
2939 } else {
2940 self.read_delimited_body(delimiter);
2941 }
2942
2943 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2944 while let Some(ch) = self.current_char() {
2945 if ch.is_ascii_alphanumeric() {
2946 self.advance();
2947 } else {
2948 break;
2949 }
2950 }
2951
2952 let text = &self.input[start..self.position];
2953 self.mode = LexerMode::ExpectOperator;
2954
2955 Some(Token {
2956 token_type: TokenType::Transliteration,
2957 text: Arc::from(text),
2958 start,
2959 end: self.position,
2960 })
2961 }
2962
2963 /// Read content between delimiters.
2964 ///
2965 /// Returns `(body, closed)` where `closed` is `true` if the closing
2966 /// delimiter was found before EOF, and `false` if EOF was reached first.
2967 fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
2968 let paired = quote_handler::paired_close(delim);
2969 let close = paired.unwrap_or(delim);
2970 let mut body = String::new();
2971 let mut depth = i32::from(paired.is_some());
2972
2973 while let Some(ch) = self.current_char() {
2974 if ch == '\\' {
2975 body.push(ch);
2976 self.advance();
2977 if let Some(next) = self.current_char() {
2978 body.push(next);
2979 self.advance();
2980 }
2981 continue;
2982 }
2983
2984 if paired.is_some() && ch == delim {
2985 body.push(ch);
2986 self.advance();
2987 depth += 1;
2988 continue;
2989 }
2990
2991 if ch == close {
2992 if paired.is_some() {
2993 depth -= 1;
2994 if depth == 0 {
2995 self.advance();
2996 return (body, true);
2997 }
2998 body.push(ch);
2999 self.advance();
3000 } else {
3001 self.advance();
3002 return (body, true);
3003 }
3004 continue;
3005 }
3006
3007 body.push(ch);
3008 self.advance();
3009 }
3010
3011 // EOF reached without finding the closing delimiter
3012 (body, false)
3013 }
3014
3015 /// Parse a quote operator after we've seen the delimiter
3016 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3017 let info = self.current_quote_op.as_ref()?;
3018 let start = info.start_pos;
3019 let operator = info.operator.clone();
3020
3021 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3022 // below) does not leave a stale reference behind. The post-match cleanup at the
3023 // bottom of this function would otherwise be skipped for those operators.
3024 self.current_quote_op = None;
3025
3026 // Parse based on operator type; track whether all delimiters were closed.
3027 let closed = match operator.as_str() {
3028 "s" => {
3029 return self.parse_substitution_with_delimiter(start, delimiter);
3030 }
3031 "tr" | "y" => {
3032 return self.parse_transliteration_with_delimiter(start, delimiter);
3033 }
3034 "qr" => {
3035 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3036 self.parse_regex_modifiers("e_handler::QR_SPEC);
3037 body_closed
3038 }
3039 "m" => {
3040 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3041 self.parse_regex_modifiers("e_handler::M_SPEC);
3042 body_closed
3043 }
3044 _ => {
3045 // q, qq, qw, qx - no modifiers
3046 let (_body, body_closed) = self.read_delimited_body(delimiter);
3047 body_closed
3048 }
3049 };
3050
3051 let text = &self.input[start..self.position];
3052
3053 self.mode = LexerMode::ExpectOperator;
3054
3055 if !closed {
3056 // EOF reached before finding the closing delimiter — emit an error
3057 // token so the parser's recovery mechanism records a diagnostic.
3058 return Some(Token {
3059 token_type: TokenType::Error(Arc::from(format!(
3060 "unclosed {} delimiter '{}'",
3061 operator, delimiter
3062 ))),
3063 text: Arc::from(text),
3064 start,
3065 end: self.position,
3066 });
3067 }
3068
3069 let token_type = quote_handler::get_quote_token_type(&operator);
3070 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3071 }
3072
3073 /// Parse regex modifiers according to the given spec
3074 ///
3075 /// This function includes ALL characters that could be intended as modifiers,
3076 /// including invalid ones. This allows the parser to properly reject invalid
3077 /// modifiers with a clear error message, rather than leaving them as separate
3078 /// tokens that could be confusingly parsed.
3079 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3080 // Consume all alphanumeric characters that could be intended as modifiers
3081 // The parser will validate and reject invalid ones
3082 while let Some(ch) = self.current_char() {
3083 if ch.is_ascii_alphanumeric() {
3084 self.advance();
3085 } else {
3086 break;
3087 }
3088 }
3089 // Note: We no longer validate here - the parser will validate and provide
3090 // clear error messages for invalid modifiers (MUT_005 fix)
3091 }
3092
3093 /// Parse a regex literal starting with `/`
3094 ///
3095 /// **Budget Protection (Issue #422)**:
3096 /// - Budget guards prevent runaway scanning on pathological input
3097 /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
3098 /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
3099 /// - Graceful degradation: emit UnknownRest token if budget exceeded
3100 ///
3101 /// **Performance**:
3102 /// - Single-pass scanning with escape handling
3103 /// - Budget check per iteration (amortized O(1) via inline fast path)
3104 /// - Typical regex: <10μs, Large regex (64KB): ~1ms
3105 fn parse_regex(&mut self, start: usize) -> Option<Token> {
3106 self.advance(); // Skip opening /
3107
3108 let mut regex_parse_steps: usize = 0;
3109 let mut in_character_class = false;
3110
3111 while let Some(ch) = self.current_char() {
3112 regex_parse_steps += 1;
3113 if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
3114 #[cfg(debug_assertions)]
3115 {
3116 let text = &self.input[start..self.position];
3117 let preview = truncate_preview(text, 50);
3118 tracing::debug!(
3119 limit = MAX_REGEX_PARSE_STEPS,
3120 pattern_preview = %preview,
3121 "Regex parse step budget exceeded"
3122 );
3123 }
3124 self.position = self.input.len();
3125 return Some(Token {
3126 token_type: TokenType::UnknownRest,
3127 text: empty_arc(),
3128 start,
3129 end: self.position,
3130 });
3131 }
3132
3133 // Budget guard: prevent timeout on pathological input (Issue #422)
3134 // If exceeded, returns UnknownRest token for graceful degradation
3135 if let Some(token) = self.budget_guard(start, 0) {
3136 return Some(token);
3137 }
3138
3139 match ch {
3140 '/' if !in_character_class => {
3141 self.advance();
3142 // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
3143 while let Some(ch) = self.current_char() {
3144 if ch.is_ascii_alphanumeric() {
3145 self.advance();
3146 } else {
3147 break;
3148 }
3149 }
3150
3151 let text = &self.input[start..self.position];
3152 self.mode = LexerMode::ExpectOperator;
3153
3154 return Some(Token {
3155 token_type: TokenType::RegexMatch,
3156 text: Arc::from(text),
3157 start,
3158 end: self.position,
3159 });
3160 }
3161 '\\' => {
3162 // Handle escape sequences: consume backslash + next char
3163 self.advance();
3164 if self.current_char().is_some() {
3165 self.advance();
3166 }
3167 }
3168 '[' => {
3169 in_character_class = true;
3170 self.advance();
3171 }
3172 ']' if in_character_class => {
3173 in_character_class = false;
3174 self.advance();
3175 }
3176 _ => self.advance(),
3177 }
3178 }
3179
3180 // Unterminated regex - EOF reached before closing /
3181 // Parser will emit diagnostic for unterminated literal
3182 None
3183 }
3184}
3185
// Process-wide shared empty `Arc<str>`, so empty tokens cost only a
// refcount bump instead of a fresh allocation each time.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return a clone of the shared empty `Arc<str>` (O(1), no allocation
/// after first use).
#[inline(always)]
fn empty_arc() -> Arc<str> {
    EMPTY_ARC.get_or_init(|| "".into()).clone()
}
3193
/// Shorten `text` to at most `max_chars` characters for log previews,
/// appending `...` when anything was cut. Slicing is done at a char
/// boundary, so multi-byte UTF-8 text is never split mid-character.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    text.char_indices()
        .nth(max_chars)
        .map_or_else(|| text.to_string(), |(cut, _)| format!("{}...", &text[..cut]))
}
3200
3201#[inline(always)]
3202fn is_keyword_fast(word: &str) -> bool {
3203 // Fast length-based rejection for most cases.
3204 // Lexer keywords are currently bounded to 1..=9 characters.
3205 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
3206}
3207
3208#[inline]
3209fn is_builtin_function(word: &str) -> bool {
3210 BARE_TERM_BUILTINS.binary_search(&word).is_ok()
3211}
3212
/// True when `word` (as raw bytes) is exactly one of Perl's quote-like
/// operator names: `m`, `q`, `qq`, `qw`, `qx`, `qr`.
#[inline(always)]
fn is_quote_op_word_prefix(word: &[u8]) -> bool {
    const QUOTE_OPS: [&[u8]; 6] = [b"m", b"q", b"qq", b"qw", b"qx", b"qr"];
    QUOTE_OPS.contains(&word)
}
3217
/// Builtin function names that may be followed directly by a bare term.
///
/// INVARIANT: this list must remain sorted in ascending byte order —
/// `is_builtin_function` performs a `binary_search` over it, which is
/// only correct on sorted input.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex", "int",
    "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print", "push",
    "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt", "substr",
    "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
3224
/// Fast lookup table of every possible *second* byte of a two-character
/// compound operator, used as a cheap pre-filter before the full pair match.
///
/// Fix: `/` was missing, so the pre-filter rejected `('/', '/')` before the
/// `(b'/', b'/')` match arm could ever fire — the defined-or operator `//`
/// was never recognized as compound.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*:/";

/// Return true when `first` followed by `second` forms a two-character
/// compound operator (`+=`, `==`, `=~`, `&&`, `<<`, `->`, `//`, `::`, ...).
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    // Every compound operator is ASCII, so any pair containing a non-ASCII
    // character can be rejected immediately. (The previous non-ASCII fallback
    // was dead code: all of its patterns were ASCII and thus unreachable in
    // that branch.)
    if !(first.is_ascii() && second.is_ascii()) {
        return false;
    }
    let first_byte = first as u8;
    let second_byte = second as u8;

    // Cheap pre-filter: reject pairs whose second byte can never end a
    // compound operator.
    if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
        return false;
    }

    match (first_byte, second_byte) {
        // Assignment operators: += -= *= /= %= &= |= ^= .=
        (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,

        // Comparison operators: <= >= == !=
        (b'<' | b'>' | b'=' | b'!', b'=') => true,

        // Pattern-bind operators: =~ !~
        (b'=' | b'!', b'~') => true,

        // Increment/decrement: ++ --
        (b'+', b'+') | (b'-', b'-') => true,

        // Logical operators: && ||
        (b'&', b'&') | (b'|', b'|') => true,

        // Shift operators: << >>
        (b'<', b'<') | (b'>', b'>') => true,

        // Other compound operators: ** // -> => .. ~~ ::
        (b'*', b'*')
        | (b'/', b'/')
        | (b'-' | b'=', b'>')
        | (b'.', b'.')
        | (b'~', b'~')
        | (b':', b':') => true,

        _ => false,
    }
}
3289
3290// Checkpoint support for incremental parsing
3291
3292mod checkpoint_impl;
3293
3294#[cfg(test)]
3295mod test_format_debug;
3296#[cfg(test)]
3297mod tests;