perl_lexer/
lib.rs

1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//!   enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//!   by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//!   for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//!   and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//!     parse_interpolation: true,  // Parse string interpolation
73//!     track_positions: true,      // Track line/column positions
74//!     max_lookahead: 1024,        // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be the left-shift operator or the start of a heredoc (`<<EOF`, `<<~EOF`)
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![allow(
117    // Core allows for lexer code
118    clippy::too_many_lines,
119    clippy::module_name_repetitions,
120    clippy::cast_possible_truncation,
121    clippy::cast_sign_loss,
122    clippy::cast_possible_wrap,
123    clippy::cast_precision_loss,
124    clippy::must_use_candidate,
125    clippy::missing_errors_doc,
126    clippy::missing_panics_doc,
127
128    // Lexer-specific patterns that are fine
129    clippy::match_same_arms,
130    clippy::redundant_else,
131    clippy::unnecessary_wraps,
132    clippy::unused_self,
133    clippy::items_after_statements,
134    clippy::struct_excessive_bools,
135    clippy::uninlined_format_args
136)]
137
138use std::sync::Arc;
139
140pub mod api;
141pub mod builtins;
142pub mod checkpoint;
143pub mod config;
144pub mod error;
145mod heredoc;
146pub mod keywords;
147mod lexer;
148pub mod limits;
149pub mod mode;
150mod quote_handler;
151pub mod token;
152pub mod tokenizer;
153mod unicode;
154
155pub use api::*;
156pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
157pub use config::LexerConfig;
158pub use error::{LexerError, Result};
159pub use lexer::PerlLexer;
160pub use limits::MAX_REGEX_PARSE_STEPS;
161pub use mode::LexerMode;
162pub use perl_position_tracking::Position;
163pub use token::{StringPart, Token, TokenType};
164
165use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
166
167use crate::heredoc::HeredocSpec;
168use crate::lexer::helpers::{
169    empty_arc, is_builtin_function, is_compound_operator, is_keyword_fast, is_quote_op_word_prefix,
170    truncate_preview,
171};
172use crate::limits::{
173    HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
174};
175
176impl<'a> PerlLexer<'a> {
177    /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
178    pub fn with_body_tokens(input: &'a str) -> Self {
179        let mut lexer = Self::new(input);
180        lexer.emit_heredoc_body_tokens = true;
181        lexer
182    }
183
184    /// Set the lexer mode (for resetting state at statement boundaries)
185    pub fn set_mode(&mut self, mode: LexerMode) {
186        self.mode = mode;
187    }
188
189    /// Advance the lexer and return the next token.
190    ///
191    /// Returns `None` only after an `EOF` token has already been emitted.
192    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
193    pub fn next_token(&mut self) -> Option<Token> {
194        // Normalize file start (BOM) once
195        if self.position == 0 {
196            self.normalize_file_start();
197        }
198        self.normalize_char_boundary();
199
200        // Loop to avoid recursion when processing heredocs
201        loop {
202            // Handle format body parsing if we're in that mode
203            if matches!(self.mode, LexerMode::InFormatBody) {
204                return self.parse_format_body();
205            }
206
207            // Handle data section parsing if we're in that mode
208            if matches!(self.mode, LexerMode::InDataSection) {
209                return self.parse_data_body();
210            }
211
212            // Check if we're inside a heredoc body BEFORE skipping whitespace
213            let mut found_terminator = false;
214            if !self.pending_heredocs.is_empty() {
215                // Clone what we need to avoid holding a borrow
216                let (body_start, label, allow_indent) =
217                    if let Some(spec) = self.pending_heredocs.first() {
218                        if spec.body_start > 0
219                            && self.position >= spec.body_start
220                            && self.position < self.input.len()
221                        {
222                            (spec.body_start, spec.label.clone(), spec.allow_indent)
223                        } else {
224                            // Not in a heredoc body yet or at EOF
225                            (0, empty_arc(), false)
226                        }
227                    } else {
228                        (0, empty_arc(), false)
229                    };
230
231                if body_start > 0 {
232                    // We're inside a heredoc body - scan for the terminator
233
234                    // Scan line by line looking for the terminator
235                    while self.position < self.input.len() {
236                        // Timeout protection (Issue #443)
237                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
238                            self.pending_heredocs.remove(0);
239                            self.position = self.input.len();
240                            return Some(Token {
241                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
242                                text: Arc::from(&self.input[body_start..]),
243                                start: body_start,
244                                end: self.input.len(),
245                            });
246                        }
247
248                        // Budget cap for huge bodies - optimized check
249                        if self.position - body_start > MAX_HEREDOC_BYTES {
250                            // Remove the pending heredoc to avoid infinite loop
251                            self.pending_heredocs.remove(0);
252                            self.position = self.input.len();
253                            return Some(Token {
254                                token_type: TokenType::UnknownRest,
255                                text: Arc::from(&self.input[body_start..]),
256                                start: body_start,
257                                end: self.input.len(),
258                            });
259                        }
260
261                        // Skip to start of next line if not at line start
262                        // Exception: if we're at body_start exactly, we're at the heredoc body start
263                        if !self.after_newline && self.position != body_start {
264                            while self.position < self.input.len()
265                                && self.input_bytes[self.position] != b'\n'
266                                && self.input_bytes[self.position] != b'\r'
267                            {
268                                self.advance();
269                            }
270                            self.consume_newline();
271                            continue;
272                        }
273
274                        // We're at line start - check if this line is the terminator
275                        let line_start = self.position;
276                        let line_end = Self::find_line_end(self.input_bytes, self.position);
277                        let line = &self.input[line_start..line_end];
278                        // Strip trailing spaces/tabs (Perl allows them)
279                        let trimmed_end = line.trim_end_matches([' ', '\t']);
280
281                        // Check if this line is the terminator
282                        let is_terminator = if allow_indent {
283                            // Allow any leading spaces/tabs before the label
284                            let mut p = 0;
285                            while p < trimmed_end.len() {
286                                let b = trimmed_end.as_bytes()[p];
287                                if b == b' ' || b == b'\t' {
288                                    p += 1;
289                                } else {
290                                    break;
291                                }
292                            }
293                            trimmed_end[p..] == *label
294                        } else {
295                            // Must start at column 0 (no leading whitespace)
296                            // The terminator is just the label (already trimmed trailing whitespace)
297                            trimmed_end == &*label
298                        };
299
300                        if is_terminator {
301                            // Found the terminator!
302                            self.pending_heredocs.remove(0);
303                            found_terminator = true;
304
305                            // Consume past the terminator line
306                            self.position = line_end;
307                            self.consume_newline();
308
309                            // Set body_start for the next pending heredoc (if any)
310                            if let Some(next) = self.pending_heredocs.first_mut()
311                                && next.body_start == 0
312                            {
313                                next.body_start = self.position;
314                            }
315
316                            // Only emit HeredocBody if requested (for folding)
317                            if self.emit_heredoc_body_tokens {
318                                return Some(Token {
319                                    token_type: TokenType::HeredocBody(empty_arc()),
320                                    text: empty_arc(),
321                                    start: body_start,
322                                    end: line_start,
323                                });
324                            }
325                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
326                            break; // Break inner while loop, continue outer loop
327                        }
328
329                        // Not the terminator, continue to next line
330                        self.position = line_end;
331                        self.consume_newline();
332                    }
333
334                    // If we didn't find a terminator, we reached EOF - emit error token
335                    if !found_terminator {
336                        // Remove the pending heredoc to avoid infinite loop
337                        self.pending_heredocs.remove(0);
338                        self.position = self.input.len();
339                        return Some(Token {
340                            token_type: TokenType::UnknownRest,
341                            text: Arc::from(&self.input[body_start..]),
342                            start: body_start,
343                            end: self.input.len(),
344                        });
345                    }
346                }
347
348                // If we found a terminator, continue outer loop to get next token
349                if found_terminator {
350                    continue; // Continue outer loop to get next token
351                }
352            }
353
354            self.skip_whitespace_and_comments()?;
355
356            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
357            if !self.pending_heredocs.is_empty()
358                && let Some(spec) = self.pending_heredocs.first()
359                && spec.body_start > 0
360                && self.position >= spec.body_start
361                && self.position < self.input.len()
362            {
363                continue; // Go back to top of loop to process heredoc
364            }
365
366            // If we reach EOF with pending heredocs, clear them and emit EOF
367            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
368                self.pending_heredocs.clear();
369            }
370
371            if self.position >= self.input.len() {
372                if self.eof_emitted {
373                    return None; // Stop the stream
374                }
375                self.eof_emitted = true;
376                return Some(Token {
377                    token_type: TokenType::EOF,
378                    text: empty_arc(),
379                    start: self.position,
380                    end: self.position,
381                });
382            }
383
384            let start = self.position;
385
386            // Check for special tokens first
387            if let Some(token) = self.try_heredoc() {
388                return Some(token);
389            }
390
391            if let Some(token) = self.try_string() {
392                return Some(token);
393            }
394
395            if let Some(token) = self.try_variable() {
396                return Some(token);
397            }
398
399            if let Some(token) = self.try_number() {
400                return Some(token);
401            }
402
403            if let Some(token) = self.try_vstring() {
404                return Some(token);
405            }
406
407            if let Some(token) = self.try_identifier_or_keyword() {
408                return Some(token);
409            }
410
411            // If we're expecting a delimiter for a quote operator, only try delimiter
412            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
413                if let Some(token) = self.try_delimiter() {
414                    return Some(token);
415                }
416                // Do NOT fall through to try_operator / try_punct / etc.
417                // Clear state first so we don't spin
418                self.mode = LexerMode::ExpectOperator;
419                self.current_quote_op = None;
420                continue;
421            }
422
423            if let Some(token) = self.try_operator() {
424                return Some(token);
425            }
426
427            if let Some(token) = self.try_delimiter() {
428                return Some(token);
429            }
430
431            // If nothing else matches, return an error token
432            let ch = self.current_char()?;
433            self.advance();
434
435            // Optimize error token creation - avoid expensive formatting in hot path
436            let text = if ch.is_ascii() {
437                // Fast path for ASCII characters
438                Arc::from(&self.input[start..self.position])
439            } else {
440                // Unicode path without intermediate heap allocation
441                let mut buf = [0_u8; 4];
442                Arc::from(ch.encode_utf8(&mut buf))
443            };
444
445            return Some(Token {
446                token_type: TokenType::Error(Arc::from("Unexpected character")),
447                text,
448                start,
449                end: self.position,
450            });
451        } // End of loop
452    }
453
454    /// Budget guard to prevent infinite loops and timeouts (Issue #422)
455    ///
456    /// **Purpose**: Protect against pathological input that could cause:
457    /// - Infinite loops in regex/heredoc parsing
458    /// - Excessive memory consumption
459    /// - LSP server hangs
460    ///
461    /// **Limits**:
462    /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
463    /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
464    ///
465    /// **Graceful Degradation**:
466    /// - Budget exceeded → emit `UnknownRest` token
467    /// - Jump to EOF to prevent further parsing of problematic region
468    /// - LSP client can emit soft diagnostic about truncation
469    /// - All previously parsed symbols remain valid
470    ///
471    /// **Performance**:
472    /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
473    /// - Slow path: Only triggered on pathological input
474    /// - Amortized cost: O(1) per token
475    #[allow(clippy::inline_always)] // Performance critical in lexer hot path
476    #[inline(always)]
477    fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
478        // Fast path: most calls won't hit limits
479        let bytes_consumed = self.position - start;
480        if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
481            return None;
482        }
483
484        // Slow path: budget exceeded - graceful degradation
485        #[cfg(debug_assertions)]
486        {
487            tracing::debug!(
488                bytes_consumed,
489                depth,
490                position = self.position,
491                "Lexer budget exceeded"
492            );
493        }
494
495        self.position = self.input.len();
496        Some(Token {
497            token_type: TokenType::UnknownRest,
498            text: Arc::from(""),
499            start,
500            end: self.position,
501        })
502    }
503
504    /// Peek at the next token without consuming it.
505    ///
506    /// Saves and restores the full lexer state so the next call to
507    /// [`next_token`](Self::next_token) returns the same token.
508    pub fn peek_token(&mut self) -> Option<Token> {
509        let saved_pos = self.position;
510        let saved_mode = self.mode;
511        let saved_delimiter_stack = self.delimiter_stack.clone();
512        let saved_prototype = self.in_prototype;
513        let saved_depth = self.prototype_depth;
514        let saved_after_sub = self.after_sub;
515        let saved_after_arrow = self.after_arrow;
516        let saved_hash_brace_depth = self.hash_brace_depth;
517        let saved_after_var_subscript = self.after_var_subscript;
518        let saved_paren_depth = self.paren_depth;
519        let saved_current_pos = self.current_pos;
520        let saved_after_newline = self.after_newline;
521        let saved_pending_heredocs = self.pending_heredocs.clone();
522        let saved_line_start_offset = self.line_start_offset;
523        let saved_current_quote_op = self.current_quote_op.clone();
524        let saved_eof_emitted = self.eof_emitted;
525        let saved_start_time = self.start_time;
526
527        let token = self.next_token();
528
529        self.position = saved_pos;
530        self.mode = saved_mode;
531        self.delimiter_stack = saved_delimiter_stack;
532        self.in_prototype = saved_prototype;
533        self.prototype_depth = saved_depth;
534        self.after_sub = saved_after_sub;
535        self.after_arrow = saved_after_arrow;
536        self.hash_brace_depth = saved_hash_brace_depth;
537        self.after_var_subscript = saved_after_var_subscript;
538        self.paren_depth = saved_paren_depth;
539        self.current_pos = saved_current_pos;
540        self.after_newline = saved_after_newline;
541        self.pending_heredocs = saved_pending_heredocs;
542        self.line_start_offset = saved_line_start_offset;
543        self.current_quote_op = saved_current_quote_op;
544        self.eof_emitted = saved_eof_emitted;
545        self.start_time = saved_start_time;
546
547        token
548    }
549
550    /// Consume all remaining tokens and return them as a vector.
551    ///
552    /// The returned vector always ends with an `EOF` token.
553    pub fn collect_tokens(&mut self) -> Vec<Token> {
554        let mut tokens = Vec::new();
555        while let Some(token) = self.next_token() {
556            if token.token_type == TokenType::EOF {
557                tokens.push(token);
558                break;
559            }
560            tokens.push(token);
561        }
562        tokens
563    }
564
565    /// Reset the lexer to the beginning of the input.
566    ///
567    /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
568    /// so the lexer can re-tokenize the same source from scratch.
569    pub fn reset(&mut self) {
570        self.position = 0;
571        self.mode = LexerMode::ExpectTerm;
572        self.delimiter_stack.clear();
573        self.in_prototype = false;
574        self.prototype_depth = 0;
575        self.after_sub = false;
576        self.after_arrow = false;
577        self.hash_brace_depth = 0;
578        self.after_var_subscript = false;
579        self.paren_depth = 0;
580        self.current_pos = Position::start();
581        self.after_newline = true;
582        self.pending_heredocs.clear();
583        self.line_start_offset = 0;
584        self.current_quote_op = None;
585        self.eof_emitted = false;
586        self.start_time = std::time::Instant::now();
587    }
588
589    /// Switch the lexer into format-body parsing mode.
590    ///
591    /// In this mode the lexer consumes input verbatim until it encounters a
592    /// line containing only `.` (the Perl format terminator).
593    pub fn enter_format_mode(&mut self) {
594        self.mode = LexerMode::InFormatBody;
595    }
596
597    // Token-specific parsing methods
598
599    #[inline]
600    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
601        // Don't reset after_newline if we're at the start of a line
602        if self.position > 0 && self.position != self.line_start_offset {
603            self.after_newline = false;
604        }
605
606        while self.position < self.input_bytes.len() {
607            let byte = Self::byte_at(self.input_bytes, self.position);
608            match byte {
609                // Fast path for ASCII whitespace - batch process
610                b' ' => {
611                    // Batch skip spaces for better cache efficiency
612                    let start = self.position;
613                    while self.position < self.input_bytes.len()
614                        && Self::byte_at(self.input_bytes, self.position) == b' '
615                    {
616                        self.position += 1;
617                    }
618                    // Continue outer loop if we processed any spaces
619                    if self.position > start {
620                        // Loop naturally continues to next iteration
621                    }
622                }
623                b'\t' | 0x0B | 0x0C => {
624                    // Batch skip horizontal tab, vertical tab, and form feed.
625                    // Perl treats these as whitespace separators.
626                    let start = self.position;
627                    while self.position < self.input_bytes.len()
628                        && matches!(
629                            Self::byte_at(self.input_bytes, self.position),
630                            b'\t' | 0x0B | 0x0C
631                        )
632                    {
633                        self.position += 1;
634                    }
635                    if self.position > start {
636                        // Loop naturally continues to next iteration
637                    }
638                }
639                b'\r' | b'\n' => {
640                    self.consume_newline();
641
642                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
643                    // Only check if we have pending heredocs to avoid unnecessary work
644                    if !self.pending_heredocs.is_empty() {
645                        for spec in &mut self.pending_heredocs {
646                            if spec.body_start == 0 {
647                                spec.body_start = self.position;
648                                break; // Only set for the first unresolved heredoc
649                            }
650                        }
651                    }
652                }
653                b'#' => {
654                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
655                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
656                        break;
657                    }
658
659                    // Skip line comment using memchr for fast newline search
660                    self.position += 1; // Skip # directly
661
662                    // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
663                    if let Some(newline_offset) =
664                        memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
665                    {
666                        self.position += newline_offset;
667                    } else {
668                        // No newline found, skip to end
669                        self.position = self.input_bytes.len();
670                    }
671                }
672                b'=' if self.position == 0
673                    || (self.position > 0
674                        && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
675                {
676                    // Check if this starts a POD section (=pod, =head, =over, etc.)
677                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
678                    let remaining = &self.input_bytes[self.position..];
679                    if remaining.starts_with(b"=pod")
680                        || remaining.starts_with(b"=head")
681                        || remaining.starts_with(b"=over")
682                        || remaining.starts_with(b"=item")
683                        || remaining.starts_with(b"=back")
684                        || remaining.starts_with(b"=begin")
685                        || remaining.starts_with(b"=end")
686                        || remaining.starts_with(b"=for")
687                        || remaining.starts_with(b"=encoding")
688                    {
689                        // Scan forward for \n=cut (end of POD block)
690                        let search_start = self.position;
691                        let mut found_cut = false;
692                        let bytes = self.input_bytes;
693                        let mut i = search_start;
694                        while i < bytes.len() {
695                            // Look for =cut at the start of a line
696                            if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
697                                && bytes[i..].starts_with(b"=cut")
698                            {
699                                i += 4; // Skip "=cut"
700                                // Skip rest of the =cut line
701                                while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
702                                    i += 1;
703                                }
704                                // Consume one line ending sequence if present
705                                if i < bytes.len() && bytes[i] == b'\r' {
706                                    i += 1;
707                                    if i < bytes.len() && bytes[i] == b'\n' {
708                                        i += 1;
709                                    }
710                                } else if i < bytes.len() && bytes[i] == b'\n' {
711                                    i += 1;
712                                }
713                                self.position = i;
714                                found_cut = true;
715                                break;
716                            }
717                            i += 1;
718                        }
719                        if !found_cut {
720                            // POD extends to end of file
721                            self.position = bytes.len();
722                        }
723                        continue;
724                    }
725                    // Not a POD directive - regular '=' token
726                    break;
727                }
728                _ => {
729                    // For non-ASCII whitespace, use char check only when needed
730                    if byte >= 128
731                        && let Some(ch) = self.current_char()
732                        && ch.is_whitespace()
733                    {
734                        self.advance();
735                        continue;
736                    }
737                    break;
738                }
739            }
740        }
741        Some(())
742    }
743
744    fn try_heredoc(&mut self) -> Option<Token> {
745        // `<<` is the left-shift operator, not a heredoc, when we are inside
746        // a parenthesized expression and have just finished a term.
747        // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
748        // so `<<index` must be the bitshift operator, not a heredoc start.
749        //
750        // We must NOT fire the guard at statement level (paren_depth == 0) because
751        // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
752        // is a heredoc.  The depth check distinguishes the two cases.
753        if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
754            return None;
755        }
756
757        // Check for heredoc start
758        if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
759            return None;
760        }
761
762        let start = self.position;
763        let mut text = String::from("<<");
764        self.position += 2; // Skip <<
765
766        // Check for indented heredoc (~)
767        let allow_indent = if self.current_char() == Some('~') {
768            text.push('~');
769            self.advance();
770            true
771        } else {
772            false
773        };
774
775        // Skip whitespace
776        while let Some(ch) = self.current_char() {
777            if ch == ' ' || ch == '\t' {
778                text.push(ch);
779                self.advance();
780            } else {
781                break;
782            }
783        }
784
785        // Optional backslash disables interpolation, treat like single-quoted label
786        let backslashed = if self.current_char() == Some('\\') {
787            text.push('\\');
788            self.advance();
789            true
790        } else {
791            false
792        };
793
794        // Parse delimiter
795        let delimiter = if self.position < self.input.len() {
796            match self.current_char() {
797                Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
798                Some('\'') if !backslashed => {
799                    self.parse_quoted_heredoc_delimiter('\'', &mut text)?
800                }
801                Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
802                Some(c) if is_perl_identifier_start(c) => {
803                    // Bare word delimiter
804                    let mut delim = String::new();
805                    while self.position < self.input.len() {
806                        if let Some(c) = self.current_char() {
807                            if is_perl_identifier_continue(c) {
808                                delim.push(c);
809                                text.push(c);
810                                self.advance();
811                            } else {
812                                break;
813                            }
814                        } else {
815                            break;
816                        }
817                    }
818                    delim
819                }
820                _ => {
821                    // Not a valid heredoc delimiter - reset position and return None
822                    // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
823                    self.position = start;
824                    return None;
825                }
826            }
827        } else {
828            // No delimiter found - reset position and return None
829            self.position = start;
830            return None;
831        };
832
833        // For now, return a placeholder token
834        // The actual heredoc body would be parsed later when we encounter it
835        self.mode = LexerMode::ExpectOperator;
836
837        // Recursion depth limit (Issue #443)
838        if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
839            return Some(Token {
840                token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
841                text: Arc::from(text),
842                start,
843                end: self.position,
844            });
845        }
846
847        // Queue the heredoc spec with its label
848        self.pending_heredocs.push(HeredocSpec {
849            label: Arc::from(delimiter.as_str()),
850            body_start: 0, // Will be set when we see the newline after this line
851            allow_indent,
852        });
853
854        Some(Token {
855            token_type: TokenType::HeredocStart,
856            text: Arc::from(text),
857            start,
858            end: self.position,
859        })
860    }
861
862    fn try_string(&mut self) -> Option<Token> {
863        let start = self.position;
864        let quote = self.current_char()?;
865
866        match quote {
867            '"' => self.parse_double_quoted_string(start),
868            '\'' => self.parse_single_quoted_string(start),
869            '`' => self.parse_backtick_string(start),
870            'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
871            _ => None,
872        }
873    }
874
875    #[inline]
876    fn try_number(&mut self) -> Option<Token> {
877        let start = self.position;
878
879        // Fast byte check for digits - optimized bounds checking
880        let bytes = self.input_bytes;
881        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
882            return None;
883        }
884
885        // Check for hex (0x), binary (0b), or octal (0o) prefixes
886        let mut pos = self.position;
887        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
888            let prefix_byte = bytes[pos + 1];
889            if prefix_byte == b'x' || prefix_byte == b'X' {
890                // Hexadecimal: 0x[0-9a-fA-F_]+
891                pos += 2; // consume '0x'
892                let digit_start = pos;
893                let mut saw_digit = false;
894                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
895                    saw_digit |= bytes[pos].is_ascii_hexdigit();
896                    pos += 1;
897                }
898                if pos > digit_start && saw_digit {
899                    self.position = pos;
900                    let text = &self.input[start..self.position];
901                    self.mode = LexerMode::ExpectOperator;
902                    return Some(Token {
903                        token_type: TokenType::Number(Arc::from(text)),
904                        text: Arc::from(text),
905                        start,
906                        end: self.position,
907                    });
908                }
909                // No hex digits after 0x - fall through to parse '0' as decimal
910            } else if prefix_byte == b'b' || prefix_byte == b'B' {
911                // Binary: 0b[01_]+
912                pos += 2; // consume '0b'
913                let digit_start = pos;
914                let mut saw_digit = false;
915                while pos < bytes.len()
916                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
917                {
918                    saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
919                    pos += 1;
920                }
921                if pos > digit_start && saw_digit {
922                    self.position = pos;
923                    let text = &self.input[start..self.position];
924                    self.mode = LexerMode::ExpectOperator;
925                    return Some(Token {
926                        token_type: TokenType::Number(Arc::from(text)),
927                        text: Arc::from(text),
928                        start,
929                        end: self.position,
930                    });
931                }
932                // No binary digits after 0b - fall through to parse '0' as decimal
933            } else if prefix_byte == b'o' || prefix_byte == b'O' {
934                // Octal (explicit): 0o[0-7_]+
935                pos += 2; // consume '0o'
936                let digit_start = pos;
937                let mut saw_digit = false;
938                while pos < bytes.len()
939                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
940                {
941                    saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
942                    pos += 1;
943                }
944                if pos > digit_start && saw_digit {
945                    self.position = pos;
946                    let text = &self.input[start..self.position];
947                    self.mode = LexerMode::ExpectOperator;
948                    return Some(Token {
949                        token_type: TokenType::Number(Arc::from(text)),
950                        text: Arc::from(text),
951                        start,
952                        end: self.position,
953                    });
954                }
955                // No octal digits after 0o - fall through to parse '0' as decimal
956            }
957        }
958
959        // Consume initial digits - unrolled for better performance
960        pos = self.position;
961        while pos < bytes.len() {
962            let byte = Self::byte_at(bytes, pos);
963            if byte.is_ascii_digit() || byte == b'_' {
964                pos += 1;
965            } else {
966                break;
967            }
968        }
969        self.position = pos;
970
971        // Check for decimal point - optimized with single bounds check
972        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
973            // Peek ahead to see what follows the dot
974            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();
975
976            // Optimized dot consumption logic
977            let should_consume_dot = has_following_digit || {
978                pos + 1 >= bytes.len() || {
979                    // Use bitwise operations for faster character classification
980                    let next_byte = bytes[pos + 1];
981                    // Whitespace, delimiters, operators - optimized check
982                    next_byte <= b' '
983                        || matches!(
984                            next_byte,
985                            b';' | b','
986                                | b')'
987                                | b'}'
988                                | b']'
989                                | b'+'
990                                | b'-'
991                                | b'*'
992                                | b'/'
993                                | b'%'
994                                | b'='
995                                | b'<'
996                                | b'>'
997                                | b'!'
998                                | b'&'
999                                | b'|'
1000                                | b'^'
1001                                | b'~'
1002                                | b'e'
1003                                | b'E'
1004                        )
1005                }
1006            };
1007
1008            if should_consume_dot {
1009                pos += 1; // consume the dot
1010                // Consume fractional digits - batch processing
1011                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
1012                    pos += 1;
1013                }
1014                self.position = pos;
1015            }
1016        }
1017
1018        // Check for exponent - optimized
1019        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
1020            let exp_start = pos;
1021            pos += 1; // consume 'e' or 'E'
1022
1023            // Check for optional sign
1024            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
1025                pos += 1;
1026            }
1027
1028            // Must have at least one digit after exponent (underscores allowed between digits)
1029            let mut saw_digit = false;
1030            while pos < bytes.len() {
1031                let byte = bytes[pos];
1032                if byte.is_ascii_digit() {
1033                    saw_digit = true;
1034                    pos += 1;
1035                } else if byte == b'_' {
1036                    pos += 1;
1037                } else {
1038                    break;
1039                }
1040            }
1041
1042            // If no digits after exponent, backtrack
1043            if !saw_digit {
1044                pos = exp_start;
1045            }
1046
1047            self.position = pos;
1048        }
1049
1050        // Avoid string slicing for common number cases - use Arc::from directly on slice
1051        let text = &self.input[start..self.position];
1052        self.mode = LexerMode::ExpectOperator;
1053
1054        Some(Token {
1055            token_type: TokenType::Number(Arc::from(text)),
1056            text: Arc::from(text),
1057            start,
1058            end: self.position,
1059        })
1060    }
1061
1062    fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1063        // We're at the dot, consume it
1064        self.advance();
1065
1066        // Parse the fractional part
1067        while self.position < self.input_bytes.len() {
1068            let byte = self.input_bytes[self.position];
1069            match byte {
1070                b'0'..=b'9' | b'_' => self.position += 1,
1071                b'e' | b'E' => {
1072                    // Handle scientific notation.
1073                    // Save the position of 'e'/'E' so we can backtrack here if
1074                    // no digits follow the exponent marker (with or without sign).
1075                    let e_pos = self.position;
1076                    self.advance();
1077                    if self.position < self.input_bytes.len() {
1078                        let next = self.input_bytes[self.position];
1079                        if next == b'+' || next == b'-' {
1080                            self.advance();
1081                        }
1082                    }
1083                    // Parse exponent digits (underscores allowed between digits)
1084                    let mut saw_digit = false;
1085                    while self.position < self.input_bytes.len() {
1086                        let byte = self.input_bytes[self.position];
1087                        if byte.is_ascii_digit() {
1088                            saw_digit = true;
1089                            self.position += 1;
1090                        } else if byte == b'_' {
1091                            self.position += 1;
1092                        } else {
1093                            break;
1094                        }
1095                    }
1096
1097                    // No digits after exponent marker — backtrack to just before
1098                    // 'e'/'E' so the caller sees it as a separate token.
1099                    // Using e_pos (not exponent_start-1) avoids including 'e' in
1100                    // the number slice when a sign character was consumed.
1101                    if !saw_digit {
1102                        self.position = e_pos;
1103                    }
1104                    break;
1105                }
1106                _ => break,
1107            }
1108        }
1109
1110        let text = &self.input[start..self.position];
1111        self.mode = LexerMode::ExpectOperator;
1112
1113        Some(Token {
1114            token_type: TokenType::Number(Arc::from(text)),
1115            text: Arc::from(text),
1116            start,
1117            end: self.position,
1118        })
1119    }
1120
1121    fn try_variable(&mut self) -> Option<Token> {
1122        let start = self.position;
1123        let sigil = self.current_char()?;
1124
1125        match sigil {
1126            '$' | '@' | '%' | '*' => {
1127                // In ExpectOperator mode, treat % and * as operators rather than sigils
1128                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
1129                    return None;
1130                }
1131                self.advance();
1132
1133                // Special case: After ->, sigils followed by { or [ should be tokenized separately
1134                // This is for postfix dereference like ->@*, ->%{}, ->@[]
1135                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
1136                let check_arrow = self.position >= 3
1137                    && self.position.saturating_sub(1) <= self.input.len()
1138                    && self.input.is_char_boundary(self.position.saturating_sub(3))
1139                    && self.input.is_char_boundary(self.position.saturating_sub(1));
1140
1141                if check_arrow
1142                    && {
1143                        let saved = self.position;
1144                        self.position -= 3;
1145                        let arrow = self.matches_bytes(b"->");
1146                        self.position = saved;
1147                        arrow
1148                    }
1149                    && matches!(self.current_char(), Some('{' | '[' | '*'))
1150                {
1151                    // Just return the sigil
1152                    let text = &self.input[start..self.position];
1153                    self.mode = LexerMode::ExpectOperator;
1154
1155                    return Some(Token {
1156                        token_type: TokenType::Identifier(Arc::from(text)),
1157                        text: Arc::from(text),
1158                        start,
1159                        end: self.position,
1160                    });
1161                }
1162
1163                // Check for $# (array length operator)
1164                if sigil == '$' && self.current_char() == Some('#') {
1165                    self.advance(); // consume #
1166                    // Now parse the array name
1167                    while let Some(ch) = self.current_char() {
1168                        if is_perl_identifier_continue(ch) {
1169                            self.advance();
1170                        } else if ch == ':' && self.peek_char(1) == Some(':') {
1171                            // Package-qualified array name
1172                            self.advance();
1173                            self.advance();
1174                        } else {
1175                            break;
1176                        }
1177                    }
1178
1179                    let text = &self.input[start..self.position];
1180                    self.mode = LexerMode::ExpectOperator;
1181                    // $#foo is a complete variable token; a following `{` is a subscript.
1182                    self.after_var_subscript = true;
1183
1184                    return Some(Token {
1185                        token_type: TokenType::Identifier(Arc::from(text)),
1186                        text: Arc::from(text),
1187                        start,
1188                        end: self.position,
1189                    });
1190                }
1191
1192                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
1193                if self.current_char() == Some('{') {
1194                    // Peek ahead to decide if we should consume the brace
1195                    let next_char = self.peek_char(1);
1196
1197                    // Check if this is a dereference like @{$ref} or @{[...]}
1198                    // If the next char suggests dereference, don't consume the brace.
1199                    // For @ and % sigils, identifiers inside braces are also derefs
1200                    // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
1201                    let is_deref = sigil != '*'
1202                        && (matches!(
1203                            next_char,
1204                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
1205                        ) || (matches!(sigil, '@' | '%')
1206                            && next_char.is_some_and(is_perl_identifier_start)));
1207                    if is_deref {
1208                        // This is a dereference, don't consume the brace
1209                        let text = &self.input[start..self.position];
1210                        self.mode = LexerMode::ExpectOperator;
1211                        // A standalone sigil token before `{` starts a dereference
1212                        // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
1213                        // Mark it as subscript-capable so `{` increments brace depth
1214                        // and the closing `}` can enable chained `{...}` subscripts.
1215                        // (Broader form than master's `$|@|%` filter — `*` is already
1216                        // excluded by the `is_deref` guard above and `&` deref also
1217                        // benefits from chained-subscript handling.)
1218                        self.after_var_subscript = true;
1219
1220                        return Some(Token {
1221                            token_type: TokenType::Identifier(Arc::from(text)),
1222                            text: Arc::from(text),
1223                            start,
1224                            end: self.position,
1225                        });
1226                    }
1227
1228                    self.advance(); // consume {
1229
1230                    // Handle special variables with caret
1231                    if self.current_char() == Some('^') {
1232                        self.advance(); // consume ^
1233                        // Parse the special variable name
1234                        while let Some(ch) = self.current_char() {
1235                            if ch == '}' {
1236                                self.advance(); // consume }
1237                                break;
1238                            } else if is_perl_identifier_continue(ch) {
1239                                self.advance();
1240                            } else {
1241                                break;
1242                            }
1243                        }
1244                    }
1245                    // Handle stash access like $::{foo}
1246                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
1247                        self.advance(); // consume first :
1248                        self.advance(); // consume second :
1249                        // Skip optional { and }
1250                        if self.current_char() == Some('{') {
1251                            self.advance();
1252                        }
1253                        // Parse the name
1254                        while let Some(ch) = self.current_char() {
1255                            if ch == '}' {
1256                                self.advance();
1257                                if self.current_char() == Some('}') {
1258                                    self.advance(); // consume closing } of ${...}
1259                                }
1260                                break;
1261                            } else if is_perl_identifier_continue(ch) {
1262                                self.advance();
1263                            } else {
1264                                break;
1265                            }
1266                        }
1267                    }
1268                    // Regular braced variable like ${foo} or glob like *{$glob}
1269                    else {
1270                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
1271                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
1272                        // EXCEPT for globs - *{$glob} should be parsed as one token
1273                        // Also check for empty braces or EOF - in these cases we should split the tokens
1274                        if sigil != '*'
1275                            && !self.current_char().is_some_and(is_perl_identifier_start)
1276                        {
1277                            // This is a dereference or empty/invalid brace, backtrack
1278                            self.position = start + 1; // Just past the sigil
1279                            let text = &self.input[start..self.position];
1280                            self.mode = LexerMode::ExpectOperator;
1281                            // Same as above: sigil-only token means a dereference opener.
1282                            self.after_var_subscript = true;
1283
1284                            return Some(Token {
1285                                token_type: TokenType::Identifier(Arc::from(text)),
1286                                text: Arc::from(text),
1287                                start,
1288                                end: self.position,
1289                            });
1290                        }
1291
1292                        // For glob access, we need to consume everything inside braces
1293                        if sigil == '*' {
1294                            let mut brace_depth: usize = 1;
1295                            while let Some(ch) = self.current_char() {
1296                                if ch == '{' {
1297                                    brace_depth += 1;
1298                                } else if ch == '}' {
1299                                    brace_depth = brace_depth.saturating_sub(1);
1300                                    if brace_depth == 0 {
1301                                        self.advance(); // consume final }
1302                                        break;
1303                                    }
1304                                }
1305                                self.advance();
1306                            }
1307                        } else {
1308                            // Regular variable
1309                            while let Some(ch) = self.current_char() {
1310                                if ch == '}' {
1311                                    self.advance(); // consume }
1312                                    break;
1313                                } else if is_perl_identifier_continue(ch) {
1314                                    self.advance();
1315                                } else {
1316                                    break;
1317                                }
1318                            }
1319                        }
1320                    }
1321                }
1322                // Parse regular variable name
1323                else if let Some(ch) = self.current_char() {
1324                    if is_perl_identifier_start(ch) {
1325                        while let Some(ch) = self.current_char() {
1326                            if is_perl_identifier_continue(ch) {
1327                                self.advance();
1328                            } else {
1329                                break;
1330                            }
1331                        }
1332                        // Handle package-qualified segments like Foo::bar
1333                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
1334                            self.advance();
1335                            self.advance();
1336                            while let Some(ch) = self.current_char() {
1337                                if is_perl_identifier_continue(ch) {
1338                                    self.advance();
1339                                } else {
1340                                    break;
1341                                }
1342                            }
1343                        }
1344                    }
1345                    // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
1346                    // Not inside prototypes where ^ is a literal prototype char
1347                    else if sigil == '$' && ch == '^' && !self.in_prototype {
1348                        self.advance(); // consume ^
1349                        // $^Letter: consume the single uppercase letter
1350                        if let Some(letter) = self.current_char()
1351                            && letter.is_ascii_uppercase()
1352                        {
1353                            self.advance();
1354                        }
1355                        // bare $^ (no uppercase letter follows): format_top_name — stop here
1356                    }
1357                    // Handle special punctuation variables
1358                    // Not inside prototypes where ; and , are literal prototype chars
1359                    else if sigil == '$'
1360                        && !self.in_prototype
1361                        && matches!(
1362                            ch,
1363                            '?' | '!'
1364                                | '@'
1365                                | '&'
1366                                | '`'
1367                                | '\''
1368                                | '.'
1369                                | '/'
1370                                | '\\'
1371                                | '|'
1372                                | '+'
1373                                | '-'
1374                                | '['
1375                                | ']'
1376                                | '$'
1377                                | '~'
1378                                | '='
1379                                | '%'
1380                                | ','
1381                                | '"'
1382                                | ';'
1383                                | '>'
1384                                | '<'
1385                                | ')'
1386                                | '(' // $( = real group ID of this process
1387                        )
1388                    {
1389                        self.advance(); // consume the special character
1390                    }
1391                    // $$ is the PID special variable, but only when it is not immediately
1392                    // followed by an identifier-start character. $$var is scalar dereference
1393                    // of $var, so keep the second $ for the next token.
1394                    else if sigil == '$' && ch == '$' {
1395                        if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
1396                            self.advance(); // consume the second $ for bare $$ PID
1397                        }
1398                    }
1399                    // Handle special array/hash punctuation variables
1400                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
1401                        self.advance(); // consume the + or -
1402                    }
1403                }
1404
1405                let text = &self.input[start..self.position];
1406                self.mode = LexerMode::ExpectOperator;
1407                // A complete $foo, @foo, %foo token can be followed by a hash/slice
1408                // subscript `{`. Set the flag so the `{` handler knows to increment
1409                // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
1410                // hash subscripts in the same way.
1411                self.after_var_subscript = matches!(sigil, '$' | '@' | '%');
1412
1413                Some(Token {
1414                    token_type: TokenType::Identifier(Arc::from(text)),
1415                    text: Arc::from(text),
1416                    start,
1417                    end: self.position,
1418                })
1419            }
1420            _ => None,
1421        }
1422    }
1423
1424    /// Return the next non-space char and the char immediately following it (without consuming).
1425    /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1426    /// from `=` used as a plain delimiter.
1427    fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1428        let mut i = self.position;
1429        while i < self.input.len() {
1430            let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1431                Some(c) => c,
1432                None => return (None, None),
1433            };
1434            if c.is_whitespace() {
1435                i += c.len_utf8();
1436                continue;
1437            }
1438            // Found non-space at position i; peek the next char after it
1439            let j = i + c.len_utf8();
1440            let following = self.input.get(j..).and_then(|s| s.chars().next());
1441            return (Some(c), following);
1442        }
1443        (None, None)
1444    }
1445
1446    /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1447    fn is_quote_delim(c: char) -> bool {
1448        // Perl allows any non-alphanumeric, non-whitespace character as delimiter,
1449        // including control characters (e.g. s\x07pattern\x07replacement\x07).
1450        !c.is_ascii_alphanumeric() && !c.is_whitespace()
1451    }
1452
1453    #[inline]
1454    fn immediately_follows_sigil_prefix(&self, start: usize) -> bool {
1455        start > 0
1456            && matches!(
1457                Self::byte_at(self.input_bytes, start.saturating_sub(1)),
1458                b'$' | b'@' | b'%' | b'&' | b'*'
1459            )
1460    }
1461
1462    /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1463    ///
1464    /// A v-string starts with `v` followed by one or more digits, then optionally
1465    /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1466    /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1467    #[inline]
1468    fn try_vstring(&mut self) -> Option<Token> {
1469        let start = self.position;
1470        let bytes = self.input_bytes;
1471
1472        // Must start with 'v' followed by at least one digit
1473        if start >= bytes.len() || bytes[start] != b'v' {
1474            return None;
1475        }
1476
1477        let next_pos = start + 1;
1478        if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1479            return None;
1480        }
1481
1482        // We have `v` followed by a digit — scan the rest of the v-string.
1483        // Pattern: v DIGITS (.DIGITS)*
1484        let mut pos = next_pos;
1485
1486        // Consume leading digits
1487        while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1488            pos += 1;
1489        }
1490
1491        // Consume optional `.DIGITS` segments (require at least one digit after dot)
1492        while pos < bytes.len() && bytes[pos] == b'.' {
1493            let dot_pos = pos;
1494            pos += 1; // skip '.'
1495
1496            if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1497                // Dot not followed by digit — not part of the v-string
1498                pos = dot_pos;
1499                break;
1500            }
1501
1502            // Consume digits after the dot
1503            while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1504                pos += 1;
1505            }
1506        }
1507
1508        // Make sure the v-string isn't followed by identifier-continuation characters
1509        // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1510        if pos < bytes.len() {
1511            let next_byte = bytes[pos];
1512            if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1513                return None;
1514            }
1515            // Also check for non-ASCII identifier continuations
1516            if next_byte >= 128
1517                && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1518                && is_perl_identifier_continue(ch)
1519            {
1520                return None;
1521            }
1522        }
1523
1524        // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1525        let text = &self.input[start..pos];
1526
1527        self.position = pos;
1528        self.mode = LexerMode::ExpectOperator;
1529
1530        Some(Token {
1531            token_type: TokenType::Version(Arc::from(text)),
1532            text: Arc::from(text),
1533            start,
1534            end: self.position,
1535        })
1536    }
1537
1538    #[inline]
1539    fn apostrophe_starts_legacy_package_segment(&self, position: usize) -> bool {
1540        let next_position = position + '\''.len_utf8();
1541        self.input
1542            .get(next_position..)
1543            .and_then(|suffix| suffix.chars().next())
1544            .is_some_and(is_perl_identifier_start)
1545    }
1546
1547    #[inline]
1548    fn try_identifier_or_keyword(&mut self) -> Option<Token> {
1549        let start = self.position;
1550        let ch = self.current_char()?;
1551        let bytes = self.input_bytes;
1552        let len = bytes.len();
1553
1554        if is_perl_identifier_start(ch) {
1555            // Special case: substitution/transliteration with single-quote delimiter
1556            // The single quote is considered an identifier continuation, so we need to
1557            // detect these operators before consuming it as part of an identifier.
1558            let follows_sigil_prefix = self.immediately_follows_sigil_prefix(start);
1559            if !follows_sigil_prefix
1560                && !self.after_arrow
1561                && self.hash_brace_depth == 0
1562                && ch == 's'
1563                && self.peek_char(1) == Some('\'')
1564            {
1565                self.advance(); // consume 's'
1566                return self.parse_substitution(start);
1567            } else if !follows_sigil_prefix
1568                && !self.after_arrow
1569                && self.hash_brace_depth == 0
1570                && ch == 'y'
1571                && self.peek_char(1) == Some('\'')
1572            {
1573                self.advance(); // consume 'y'
1574                return self.parse_transliteration(start);
1575            } else if !follows_sigil_prefix
1576                && !self.after_arrow
1577                && self.hash_brace_depth == 0
1578                && ch == 't'
1579                && self.peek_char(1) == Some('r')
1580                && self.peek_char(2) == Some('\'')
1581            {
1582                self.advance(); // consume 't'
1583                self.advance(); // consume 'r'
1584                return self.parse_transliteration(start);
1585            }
1586
1587            // Fast ASCII path for identifier continuation.
1588            while self.position < len {
1589                let byte = bytes[self.position];
1590                if byte == b'\'' {
1591                    if is_quote_op_word_prefix(&bytes[start..self.position])
1592                        || !self.apostrophe_starts_legacy_package_segment(self.position)
1593                    {
1594                        // Keep apostrophe for quote/string parsing in cases like q'...'
1595                        // and split' ', while still accepting Foo'Bar package spelling.
1596                        break;
1597                    }
1598                    self.position += 1;
1599                    continue;
1600                }
1601
1602                if byte.is_ascii_alphanumeric() || byte == b'_' {
1603                    self.position += 1;
1604                    continue;
1605                }
1606
1607                if byte < 128 {
1608                    break;
1609                }
1610
1611                if let Some(ch) = self.current_char()
1612                    && is_perl_identifier_continue(ch)
1613                {
1614                    self.advance();
1615                    continue;
1616                }
1617                break;
1618            }
1619            // Handle package-qualified identifiers like Foo::bar.
1620            while self.config.max_lookahead >= 1
1621                && self.position + 1 < len
1622                && bytes[self.position] == b':'
1623                && bytes[self.position + 1] == b':'
1624            {
1625                self.position += 2; // consume '::'
1626
1627                // consume following identifier segment if present
1628                let Some(ch) = self.current_char() else {
1629                    break;
1630                };
1631                if !is_perl_identifier_start(ch) {
1632                    break;
1633                }
1634                self.advance();
1635                while self.position < len {
1636                    let byte = bytes[self.position];
1637                    if byte == b'\'' {
1638                        if !self.apostrophe_starts_legacy_package_segment(self.position) {
1639                            break;
1640                        }
1641                        self.position += 1;
1642                        continue;
1643                    }
1644
1645                    if byte.is_ascii_alphanumeric() || byte == b'_' {
1646                        self.position += 1;
1647                        continue;
1648                    }
1649                    if byte < 128 {
1650                        break;
1651                    }
1652                    if let Some(ch) = self.current_char()
1653                        && is_perl_identifier_continue(ch)
1654                    {
1655                        self.advance();
1656                        continue;
1657                    }
1658                    break;
1659                }
1660            }
1661
1662            let text = &self.input[start..self.position];
1663
1664            // Check for __DATA__ and __END__ markers using exact match
1665            // Only recognize these in code channel, not inside data/format sections or heredocs
1666            let in_code_channel =
1667                !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
1668                    && self.pending_heredocs.is_empty();
1669
1670            let marker = if in_code_channel {
1671                if text == "__DATA__" {
1672                    Some("__DATA__")
1673                } else if text == "__END__" {
1674                    Some("__END__")
1675                } else {
1676                    None
1677                }
1678            } else {
1679                None
1680            };
1681
1682            if let Some(marker_text) = marker {
1683                // These must be at the beginning of a line
1684                // Use the after_newline flag to determine if we're at line start
1685                if self.after_newline {
1686                    // Check if rest of line is only whitespace
1687                    // Only treat as data marker if line has no trailing junk
1688                    if Self::trailing_ws_only(self.input_bytes, self.position) {
1689                        // Consume the rest of the line (the marker line)
1690                        while self.position < self.input.len()
1691                            && self.input_bytes[self.position] != b'\n'
1692                            && self.input_bytes[self.position] != b'\r'
1693                        {
1694                            self.advance();
1695                        }
1696                        self.consume_newline();
1697
1698                        // Switch to data section mode
1699                        self.mode = LexerMode::InDataSection;
1700
1701                        return Some(Token {
1702                            token_type: TokenType::DataMarker(Arc::from(marker_text)),
1703                            text: Arc::from(marker_text),
1704                            start,
1705                            end: self.position,
1706                        });
1707                    }
1708                }
1709            }
1710
1711            // Check for substitution/transliteration operators
1712            // Skip if after '->'  -- these are method names, not operators.
1713            #[allow(clippy::collapsible_if)]
1714            if !self.after_sub
1715                && !self.after_arrow
1716                && !follows_sigil_prefix
1717                && self.hash_brace_depth == 0
1718                && matches!(text, "s" | "tr" | "y")
1719            {
1720                let immediate = self.current_char();
1721                let (candidate, char_after_next, has_whitespace) =
1722                    if immediate.is_some_and(|c| c.is_whitespace()) {
1723                        let (nc, ca) = self.peek_nonspace_and_following();
1724                        (nc, ca, true)
1725                    } else {
1726                        let following = immediate.and_then(|c| {
1727                            let j = self.position + c.len_utf8();
1728                            self.input.get(j..).and_then(|s| s.chars().next())
1729                        });
1730                        (immediate, following, false)
1731                    };
1732
1733                if let Some(next) = candidate {
1734                    // `s => 1` should remain a fat-arrow hash key, not quote op.
1735                    let is_fat_arrow = next == '=' && char_after_next == Some('>');
1736                    let is_filetest_s = text == "s"
1737                        && self.input.get(..start).is_some_and(|prefix| prefix.ends_with('-'));
1738                    let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
1739                    let is_quote_char = matches!(next, '\'' | '"') && text != "s";
1740                    let transliteration_allows_whitespace = text == "tr" || text == "y";
1741                    let substitution_disallows_whitespace = text == "s" && has_whitespace;
1742                    let is_valid_delim = Self::is_quote_delim(next)
1743                        && !is_fat_arrow
1744                        && !is_filetest_s
1745                        && !substitution_disallows_whitespace
1746                        && (!has_whitespace
1747                            || is_paired_delim
1748                            || is_quote_char
1749                            || transliteration_allows_whitespace);
1750
1751                    if is_valid_delim {
1752                        match text {
1753                            "s" => return self.parse_substitution(start),
1754                            "tr" | "y" => return self.parse_transliteration(start),
1755                            unexpected => {
1756                                return Some(Token {
1757                                    token_type: TokenType::Error(Arc::from(format!(
1758                                        "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
1759                                        unexpected, start
1760                                    ))),
1761                                    text: Arc::from(unexpected),
1762                                    start,
1763                                    end: self.position,
1764                                });
1765                            }
1766                        }
1767                    }
1768                }
1769            }
1770
1771            let token_type = if is_keyword_fast(text) {
1772                // Check for special keywords that affect lexer mode
1773                match text {
1774                    "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
1775                    | "sort" | "split" | "and" | "or" | "xor" | "not"
1776                    // These keywords introduce an expression, so a following `/` is a
1777                    // regex, not division.  `return /re/`, `die /re/`, `warn /re/`,
1778                    // `do /file/`, and `eval /re/` are all valid Perl.
1779                    | "return" | "die" | "warn" | "do" | "eval" => {
1780                        self.mode = LexerMode::ExpectTerm;
1781                    }
1782                    "sub" => {
1783                        self.after_sub = true;
1784                        self.mode = LexerMode::ExpectTerm;
1785                    }
1786                    // Quote operators expect a delimiter next.
1787                    // Skip if after '->' -- these are method names, not operators.
1788                    // Inside hash subscript braces, regex-like operators stay bareword
1789                    // keys (`@h{m, s}`), but q-family operators can still introduce real
1790                    // quote expressions in slices (`@h{qw/a b/}`).
1791                    op if !self.after_sub
1792                        && !self.after_arrow
1793                        && !follows_sigil_prefix
1794                        && quote_handler::is_quote_operator(op)
1795                        && (self.hash_brace_depth == 0
1796                            || matches!(op, "q" | "qq" | "qw" | "qr" | "qx")) =>
1797                    {
1798                        // Perl allows whitespace between a quote-like operator and its delimiter,
1799                        // but ONLY for paired delimiters (s { ... } { ... }g).
1800                        // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
1801                        // must be immediately adjacent — otherwise `s $foo` would wrongly
1802                        // treat `$` as a delimiter instead of being a bareword `s` followed
1803                        // by a scalar variable.
1804                        //
1805                        // Strategy:
1806                        //   1. Check the immediately-adjacent char first (no whitespace skip).
1807                        //      If it is a valid delimiter → any non-alnum, non-whitespace char.
1808                        //   2. If the adjacent char is whitespace, peek past it.
1809                        //      Only accept PAIRED delimiters ({, [, (, <) in that case.
1810                        let immediate = self.current_char();
1811                        let (candidate, char_after_next, has_whitespace) =
1812                            if immediate.is_some_and(|c| c.is_whitespace()) {
1813                                // There is whitespace — peek past it
1814                                let (nc, ca) = self.peek_nonspace_and_following();
1815                                (nc, ca, true)
1816                            } else {
1817                                // No whitespace — use immediate char
1818                                let following = immediate.and_then(|c| {
1819                                    let j = self.position + c.len_utf8();
1820                                    self.input.get(j..).and_then(|s| s.chars().next())
1821                                });
1822                                (immediate, following, false)
1823                            };
1824
1825                        if let Some(next) = candidate {
1826                            // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
1827                            // not a valid substitution delimiter. Treat as identifier.
1828                            let is_fat_arrow = next == '=' && char_after_next == Some('>');
1829                            let is_filetest_s =
1830                                op == "s" && self.input.get(..start).is_some_and(|prefix| {
1831                                    prefix.ends_with('-')
1832                                });
1833
1834                            // When whitespace precedes the delimiter, only unambiguous
1835                            // delimiters are accepted:
1836                            //   - Paired delimiters ({, [, (, <) are always safe.
1837                            //   - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
1838                            //     is a valid file-size filetest and must not be treated as a
1839                            //     substitution start. All other operators (qw, q, qq, qr, qx, m,
1840                            //     tr, y) have no corresponding file-test operator.
1841                            //   - / is safe for non-substitution quote operators; `qw /a b/` and
1842                            //     `m /re/` are common, while `s /foo/bar/` remains ambiguous with
1843                            //     the file-size test shape and stays rejected here.
1844                            //   - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
1845                            let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
1846                            let is_quote_char = matches!(next, '\'' | '"') && op != "s";
1847                            let is_spaced_slash_delim = next == '/' && op != "s";
1848                            let is_hash_subscript_bare_key_boundary =
1849                                self.hash_brace_depth > 0 && matches!(next, ',' | '}');
1850                            let is_valid_delim = Self::is_quote_delim(next)
1851                                && !is_fat_arrow
1852                                && !is_filetest_s
1853                                && !is_hash_subscript_bare_key_boundary
1854                                && (!has_whitespace
1855                                    || is_paired_delim
1856                                    || is_quote_char
1857                                    || is_spaced_slash_delim);
1858
1859                            if is_valid_delim {
1860                                self.mode = LexerMode::ExpectDelimiter;
1861                                self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
1862                                    operator: op.to_string(),
1863                                    delimiter: '\0', // Will be set when we see the delimiter
1864                                    start_pos: start,
1865                                });
1866
1867                                // Don't return a keyword token - continue to parse the delimiter
1868                                // Skip any whitespace between operator and delimiter
1869                                while let Some(ch) = self.current_char() {
1870                                    if ch.is_whitespace() {
1871                                        self.advance();
1872                                    } else {
1873                                        break;
1874                                    }
1875                                }
1876
1877                                // Get the delimiter
1878                                #[allow(clippy::collapsible_if)]
1879                                if let Some(delim) = self.current_char() {
1880                                    if !delim.is_alphanumeric() {
1881                                        self.advance();
1882                                        if let Some(ref mut info) = self.current_quote_op {
1883                                            info.delimiter = delim;
1884                                        }
1885                                        // Parse the quote operator content and return the complete token
1886                                        return self.parse_quote_operator(delim);
1887                                    }
1888                                }
1889                            } else {
1890                                // Not a quote operator here → treat as IDENTIFIER
1891                                self.current_quote_op = None;
1892                                self.mode = LexerMode::ExpectOperator;
1893                                return Some(Token {
1894                                    token_type: TokenType::Identifier(Arc::from(text)),
1895                                    start,
1896                                    end: self.position,
1897                                    text: Arc::from(text),
1898                                });
1899                            }
1900                        } else {
1901                            // End-of-input after the word → also treat as IDENTIFIER
1902                            self.current_quote_op = None;
1903                            self.mode = LexerMode::ExpectOperator;
1904                            return Some(Token {
1905                                token_type: TokenType::Identifier(Arc::from(text)),
1906                                start,
1907                                end: self.position,
1908                                text: Arc::from(text),
1909                            });
1910                        }
1911                        // If we get here but haven't returned, something went wrong
1912                        // Fall through to treat as identifier
1913                        self.current_quote_op = None;
1914                        self.mode = LexerMode::ExpectOperator;
1915                        return Some(Token {
1916                            token_type: TokenType::Identifier(Arc::from(text)),
1917                            start,
1918                            end: self.position,
1919                            text: Arc::from(text),
1920                        });
1921                    }
1922                    // Format declarations need special handling
1923                    "format" => {
1924                        // We'll need to check for the = after the format name
1925                        // For now, just mark that we saw format
1926                    }
1927                    _ if is_builtin_function(text) => {
1928                        // Bare builtins are term-introducing in Perl.
1929                        self.mode = LexerMode::ExpectTerm;
1930                    }
1931                    _ => {
1932                        self.mode = LexerMode::ExpectOperator;
1933                    }
1934                }
1935                TokenType::Keyword(Arc::from(text))
1936            } else {
1937                // Mirror parser bare-builtin handling so `/` after builtins like
1938                // `join` or `print` is lexed as a regex term, not division.
1939                if is_builtin_function(text) {
1940                    self.mode = LexerMode::ExpectTerm;
1941                } else {
1942                    self.mode = LexerMode::ExpectOperator;
1943                }
1944                TokenType::Identifier(Arc::from(text))
1945            };
1946
1947            self.after_arrow = false;
1948            // A keyword/identifier is not a variable; `{` after it is a block opener.
1949            self.after_var_subscript = false;
1950            // hash_brace_depth is managed by { and } handlers, not cleared per-token
1951            Some(Token { token_type, text: Arc::from(text), start, end: self.position })
1952        } else {
1953            None
1954        }
1955    }
1956
1957    /// Parse data section body - consumes everything to EOF
1958    fn parse_data_body(&mut self) -> Option<Token> {
1959        if self.position >= self.input.len() {
1960            // Already at EOF
1961            self.mode = LexerMode::ExpectTerm;
1962            return Some(Token {
1963                token_type: TokenType::EOF,
1964                text: Arc::from(""),
1965                start: self.position,
1966                end: self.position,
1967            });
1968        }
1969
1970        let start = self.position;
1971        // Consume everything to EOF
1972        let body = &self.input[self.position..];
1973        self.position = self.input.len();
1974
1975        // Reset mode for next parse (though we're at EOF)
1976        self.mode = LexerMode::ExpectTerm;
1977
1978        Some(Token {
1979            token_type: TokenType::DataBody(Arc::from(body)),
1980            text: Arc::from(body),
1981            start,
1982            end: self.position,
1983        })
1984    }
1985
1986    /// Parse format body - consumes until a line with just a dot
1987    fn parse_format_body(&mut self) -> Option<Token> {
1988        let start = self.position;
1989        let mut body = String::new();
1990        let mut line_start = true;
1991
1992        while self.position < self.input.len() {
1993            // Check if we're at the start of a line and the next char is a dot
1994            if line_start && self.current_char() == Some('.') {
1995                // Check if this line contains only a dot
1996                let mut peek_pos = self.position + 1;
1997                let mut found_terminator = true;
1998
1999                // Skip any trailing whitespace on the dot line
2000                while peek_pos < self.input.len() {
2001                    match self.input_bytes[peek_pos] {
2002                        b' ' | b'\t' | b'\r' => peek_pos += 1,
2003                        b'\n' => break,
2004                        _ => {
2005                            found_terminator = false;
2006                            break;
2007                        }
2008                    }
2009                }
2010
2011                if found_terminator {
2012                    // We found the terminating dot, consume it
2013                    self.position = peek_pos;
2014                    if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2015                    {
2016                        self.position += 1;
2017                    }
2018
2019                    // Switch back to normal mode
2020                    self.mode = LexerMode::ExpectTerm;
2021
2022                    return Some(Token {
2023                        token_type: TokenType::FormatBody(Arc::from(body.clone())),
2024                        text: Arc::from(body),
2025                        start,
2026                        end: self.position,
2027                    });
2028                }
2029            }
2030
2031            // Not a terminator, consume the character
2032            match self.current_char() {
2033                Some(ch) => {
2034                    body.push(ch);
2035                    self.advance();
2036
2037                    // Track if we're at the start of a line
2038                    line_start = ch == '\n';
2039                }
2040                None => {
2041                    // Reached EOF without finding terminator
2042                    break;
2043                }
2044            }
2045        }
2046
2047        // If we reach here, we didn't find a terminator
2048        self.mode = LexerMode::ExpectTerm;
2049        Some(Token {
2050            token_type: TokenType::Error(Arc::from("Unterminated format body")),
2051            text: Arc::from(body),
2052            start,
2053            end: self.position,
2054        })
2055    }
2056
2057    fn try_operator(&mut self) -> Option<Token> {
2058        // Skip operator parsing if we're expecting a delimiter for a quote operator
2059        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2060            return None;
2061        }
2062
2063        let start = self.position;
2064        let ch = self.current_char()?;
2065
2066        // ═══════════════════════════════════════════════════════════════════════
2067        // SLASH DISAMBIGUATION STRATEGY (Issue #422)
2068        // ═══════════════════════════════════════════════════════════════════════
2069        //
2070        // Perl's `/` character is ambiguous:
2071        //   - Division operator: `$x / 2`
2072        //   - Regex delimiter: `/pattern/`
2073        //   - Defined-or operator: `$x // $y`
2074        //
2075        // **Disambiguation Strategy (Context-Aware Heuristics):**
2076        //
2077        // 1. **Mode-Based Decision (Primary)**:
2078        //    - `LexerMode::ExpectTerm` → `/` starts a regex
2079        //      Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
2080        //    - `LexerMode::ExpectOperator` → `/` is division or `//`
2081        //      Examples: `$x / 2`, `$x // $y`, `) / 3`
2082        //
2083        // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
2084        //    Mode is set based on previous token:
2085        //    - After identifier/number/closing paren → ExpectOperator → division
2086        //    - After operator/keyword/opening paren → ExpectTerm → regex
2087        //
2088        // 3. **Budget Protection**:
2089        //    - Regex parsing has a parse-step budget and byte budget
2090        //    - Budget exceeded → emit UnknownRest token (graceful degradation)
2091        //    - See `parse_regex()` and `budget_guard()` for implementation
2092        //
2093        // 4. **Performance Characteristics**:
2094        //    - Single-pass: O(1) decision based on mode flag
2095        //    - No backtracking: Mode updated after each token
2096        //    - Optimized: Byte-level operations for common cases
2097        //
2098        // **Metrics & Monitoring**:
2099        //    - Budget exceeded events tracked via UnknownRest token emission
2100        //    - LSP diagnostics generated for truncated regexes
2101        //    - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
2102        //
2103        // ═══════════════════════════════════════════════════════════════════════
2104
2105        if ch == '/' {
2106            if self.mode == LexerMode::ExpectTerm {
2107                // Mode indicates we're expecting a term → `/` starts a regex
2108                // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
2109                return self.parse_regex(start);
2110            } else {
2111                // Mode indicates we're expecting an operator → `/` is division or `//`
2112                // Examples: `$x / 2`, `$x // $y`, `10 / 3`
2113                self.advance();
2114                // Check for // or //= using byte-level operations for speed
2115                if self.peek_byte(0) == Some(b'/') {
2116                    self.position += 1; // consume second / directly
2117                    if self.peek_byte(0) == Some(b'=') {
2118                        self.position += 1; // consume = directly
2119                        let text = &self.input[start..self.position];
2120                        self.mode = LexerMode::ExpectTerm;
2121                        return Some(Token {
2122                            token_type: TokenType::Operator(Arc::from(text)),
2123                            text: Arc::from(text),
2124                            start,
2125                            end: self.position,
2126                        });
2127                    } else {
2128                        // Use cached string for common "//" operator
2129                        self.mode = LexerMode::ExpectTerm;
2130                        return Some(Token {
2131                            token_type: TokenType::Operator(Arc::from("//")),
2132                            text: Arc::from("//"),
2133                            start,
2134                            end: self.position,
2135                        });
2136                    }
2137                } else if self.position < self.input_bytes.len()
2138                    && self.input_bytes[self.position] == b'='
2139                {
2140                    // /= division-assign operator
2141                    self.position += 1; // consume =
2142                    self.mode = LexerMode::ExpectTerm;
2143                    return Some(Token {
2144                        token_type: TokenType::Operator(Arc::from("/=")),
2145                        text: Arc::from("/="),
2146                        start,
2147                        end: self.position,
2148                    });
2149                } else {
2150                    // Use cached string for common "/" division
2151                    self.mode = LexerMode::ExpectTerm;
2152                    return Some(Token {
2153                        token_type: TokenType::Division,
2154                        text: Arc::from("/"),
2155                        start,
2156                        end: self.position,
2157                    });
2158                }
2159            }
2160        }
2161
2162        // Handle other operators - simplified
2163        match ch {
2164            '.' => {
2165                // Check if it's a decimal number like .5 -- but only when we
2166                // expect a term.  In operator position `.5` is concatenation
2167                // of the bareword/number on the left with the number `5`.
2168                if self.mode != LexerMode::ExpectOperator
2169                    && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
2170                {
2171                    return self.parse_decimal_number(start);
2172                }
2173                self.advance();
2174                // Check for compound operators
2175                #[allow(clippy::collapsible_if)]
2176                if let Some(next) = self.current_char() {
2177                    if is_compound_operator(ch, next) {
2178                        self.advance();
2179
2180                        // Check for three-character operators like **=, <<=, >>=
2181                        if self.position < self.input.len() {
2182                            let third = self.current_char();
2183                            // Check for three-character operators
2184                            if matches!(
2185                                (ch, next, third),
2186                                ('*', '*', Some('='))
2187                                    | ('<', '<', Some('='))
2188                                    | ('>', '>', Some('='))
2189                                    | ('&', '&', Some('='))
2190                                    | ('|', '|', Some('='))
2191                                    | ('/', '/', Some('='))
2192                            ) {
2193                                self.advance(); // consume the =
2194                            } else if ch == '<' && next == '=' && third == Some('>') {
2195                                self.advance(); // consume the >
2196                            // Special case: <=> spaceship operator
2197                            } else if ch == '.' && next == '.' && third == Some('.') {
2198                                self.advance(); // consume the third .
2199                            }
2200                        }
2201                    }
2202                }
2203            }
2204            '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
2205            | '\\' => {
2206                self.advance();
2207                // Check for compound operators
2208                #[allow(clippy::collapsible_if)]
2209                if let Some(next) = self.current_char() {
2210                    if is_compound_operator(ch, next) {
2211                        self.advance();
2212
2213                        // Check for three-character operators like **=, <<=, >>=
2214                        if self.position < self.input.len() {
2215                            let third = self.current_char();
2216                            // Check for three-character operators
2217                            if matches!(
2218                                (ch, next, third),
2219                                ('*', '*', Some('='))
2220                                    | ('<', '<', Some('='))
2221                                    | ('>', '>', Some('='))
2222                                    | ('&', '&', Some('='))
2223                                    | ('|', '|', Some('='))
2224                                    | ('/', '/', Some('='))
2225                            ) {
2226                                self.advance(); // consume the =
2227                            } else if ch == '<' && next == '=' && third == Some('>') {
2228                                self.advance(); // consume the >
2229                                // Special case: <=> spaceship operator
2230                            }
2231                        }
2232                    }
2233                }
2234            }
2235            _ => return None,
2236        }
2237
2238        let text = &self.input[start..self.position];
2239        // Operator ends prototype window (e.g. `:` for attributes)
2240        self.after_sub = false;
2241        // Track whether this operator is '->' for method name disambiguation
2242        self.after_arrow = text == "->";
2243        // Any operator token ends the "just saw a variable" window; `{` after
2244        // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
2245        self.after_var_subscript = false;
2246        // Postfix ++ and -- complete a term expression, so next token is an operator
2247        // (e.g., "$x++ / 2" → / is division, not regex)
2248        if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
2249            // Postfix: stay in ExpectOperator
2250        } else {
2251            self.mode = LexerMode::ExpectTerm;
2252        }
2253
2254        Some(Token {
2255            token_type: TokenType::Operator(Arc::from(text)),
2256            text: Arc::from(text),
2257            start,
2258            end: self.position,
2259        })
2260    }
2261
2262    fn try_delimiter(&mut self) -> Option<Token> {
2263        let start = self.position;
2264        let ch = self.current_char()?;
2265
2266        // If we're expecting a delimiter for a quote operator, handle it specially
2267        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2268            // Accept any non-alphanumeric character as a delimiter
2269            if !ch.is_alphanumeric() && !ch.is_whitespace() {
2270                self.advance();
2271                if let Some(ref mut info) = self.current_quote_op {
2272                    info.delimiter = ch;
2273                }
2274                // Now parse the quote operator content
2275                return self.parse_quote_operator(ch);
2276            }
2277        }
2278
2279        match ch {
2280            '(' => {
2281                // Check if this is a quote operator delimiter
2282                if matches!(self.mode, LexerMode::ExpectDelimiter)
2283                    && self.current_quote_op.is_some()
2284                {
2285                    self.advance();
2286                    if let Some(ref mut info) = self.current_quote_op {
2287                        info.delimiter = ch;
2288                    }
2289                    return self.parse_quote_operator(ch);
2290                }
2291
2292                self.advance();
2293                if self.after_sub {
2294                    // Promote after_sub to in_prototype now that we see '('
2295                    self.in_prototype = true;
2296                    self.after_sub = false;
2297                    self.prototype_depth = 1;
2298                } else if self.in_prototype {
2299                    self.prototype_depth += 1;
2300                }
2301                self.paren_depth += 1;
2302                self.after_var_subscript = false;
2303                self.mode = LexerMode::ExpectTerm;
2304                Some(Token {
2305                    token_type: TokenType::LeftParen,
2306                    text: Arc::from("("),
2307                    start,
2308                    end: self.position,
2309                })
2310            }
2311            ')' => {
2312                self.advance();
2313                if self.in_prototype && self.prototype_depth > 0 {
2314                    self.prototype_depth -= 1;
2315                    if self.prototype_depth == 0 {
2316                        self.in_prototype = false;
2317                    }
2318                }
2319                self.after_arrow = false;
2320                self.paren_depth = self.paren_depth.saturating_sub(1);
2321                // A closing paren ends any var-subscript context: `if ($var)` should
2322                // NOT leave after_var_subscript set, otherwise the following `{` would
2323                // incorrectly increment hash_brace_depth and suppress regex operators
2324                // inside the block body (issue #2844).
2325                self.after_var_subscript = false;
2326                self.mode = LexerMode::ExpectOperator;
2327                Some(Token {
2328                    token_type: TokenType::RightParen,
2329                    text: Arc::from(")"),
2330                    start,
2331                    end: self.position,
2332                })
2333            }
2334            ';' => {
2335                self.advance();
2336                // Semicolon ends prototype window (forward declaration)
2337                self.after_sub = false;
2338                // Semicolon is a statement boundary — any pending method-call chain is over.
2339                self.after_arrow = false;
2340                self.after_var_subscript = false;
2341                self.mode = LexerMode::ExpectTerm;
2342                Some(Token {
2343                    token_type: TokenType::Semicolon,
2344                    text: Arc::from(";"),
2345                    start,
2346                    end: self.position,
2347                })
2348            }
2349            ',' => {
2350                self.advance();
2351                self.after_var_subscript = false;
2352                self.mode = LexerMode::ExpectTerm;
2353                Some(Token {
2354                    token_type: TokenType::Comma,
2355                    text: Arc::from(","),
2356                    start,
2357                    end: self.position,
2358                })
2359            }
2360            '[' => {
2361                self.advance();
2362                self.after_var_subscript = false;
2363                self.mode = LexerMode::ExpectTerm;
2364                Some(Token {
2365                    token_type: TokenType::LeftBracket,
2366                    text: Arc::from("["),
2367                    start,
2368                    end: self.position,
2369                })
2370            }
2371            ']' => {
2372                self.advance();
2373                // A closing `]` from an array subscript leaves us in a state where
2374                // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
2375                // Set after_var_subscript so the `{` handler recognises it as such.
2376                // This mirrors the `}` handler's behavior when closing a hash subscript.
2377                self.after_var_subscript = true;
2378                self.mode = LexerMode::ExpectOperator;
2379                Some(Token {
2380                    token_type: TokenType::RightBracket,
2381                    text: Arc::from("]"),
2382                    start,
2383                    end: self.position,
2384                })
2385            }
2386            '{' => {
2387                self.advance();
2388                // Opening brace ends prototype window — no prototype follows
2389                self.after_sub = false;
2390                // `{` is a hash/slice subscript opener only when it immediately follows
2391                // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
2392                // This is narrower than the old `mode == ExpectOperator` check, which
2393                // incorrectly incremented depth for block-opening braces after `sub foo`,
2394                // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
2395                // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
2396                if self.after_var_subscript {
2397                    self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
2398                }
2399                self.after_var_subscript = false;
2400                self.mode = LexerMode::ExpectTerm;
2401                Some(Token {
2402                    token_type: TokenType::LeftBrace,
2403                    text: Arc::from("{"),
2404                    start,
2405                    end: self.position,
2406                })
2407            }
2408            '}' => {
2409                self.advance();
2410                self.after_arrow = false;
2411                // Decrement hash subscript brace depth only if we were inside one.
2412                // If depth > 0, this closes a hash subscript; enable chained subscripts
2413                // like $h{a}{b} by setting after_var_subscript so the next `{` is
2414                // recognized as another subscript opener.
2415                if self.hash_brace_depth > 0 {
2416                    self.hash_brace_depth -= 1;
2417                    // The subscript value is now the "variable" for a chained subscript.
2418                    self.after_var_subscript = true;
2419                } else {
2420                    // Block-close `}` — no subscript follows
2421                    self.after_var_subscript = false;
2422                }
2423                self.mode = LexerMode::ExpectOperator;
2424                Some(Token {
2425                    token_type: TokenType::RightBrace,
2426                    text: Arc::from("}"),
2427                    start,
2428                    end: self.position,
2429                })
2430            }
2431            '#' => {
2432                // Only treat as delimiter in ExpectDelimiter mode
2433                if matches!(self.mode, LexerMode::ExpectDelimiter) {
2434                    self.advance();
2435                    // Reset mode after consuming delimiter
2436                    self.mode = LexerMode::ExpectTerm;
2437                    Some(Token {
2438                        token_type: TokenType::Operator(Arc::from("#")),
2439                        text: Arc::from("#"),
2440                        start,
2441                        end: self.position,
2442                    })
2443                } else {
2444                    None
2445                }
2446            }
2447            _ => None,
2448        }
2449    }
2450
2451    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
2452        self.advance(); // Skip opening quote
2453        let mut parts = Vec::new();
2454        let mut current_literal = String::new();
2455        let mut last_pos = self.position;
2456
2457        while let Some(ch) = self.current_char() {
2458            match ch {
2459                '"' => {
2460                    self.advance();
2461                    if !current_literal.is_empty() {
2462                        parts.push(StringPart::Literal(Arc::from(current_literal)));
2463                    }
2464
2465                    let text = &self.input[start..self.position];
2466                    self.mode = LexerMode::ExpectOperator;
2467
2468                    return Some(Token {
2469                        token_type: if parts.is_empty() {
2470                            TokenType::StringLiteral
2471                        } else {
2472                            TokenType::InterpolatedString(parts)
2473                        },
2474                        text: Arc::from(text),
2475                        start,
2476                        end: self.position,
2477                    });
2478                }
2479                '\\' => {
2480                    self.advance();
2481                    if let Some(escaped) = self.current_char() {
2482                        // Optimize by reserving space to avoid frequent reallocations
2483                        if current_literal.capacity() == 0 {
2484                            current_literal.reserve(32);
2485                        }
2486                        current_literal.push('\\');
2487                        current_literal.push(escaped);
2488                        self.advance();
2489                    }
2490                }
2491                '$' if self.config.parse_interpolation => {
2492                    // Handle variable interpolation - avoid unnecessary clone
2493                    if !current_literal.is_empty() {
2494                        parts.push(StringPart::Literal(Arc::from(current_literal)));
2495                        current_literal = String::new(); // Clear without cloning
2496                    }
2497
2498                    let part_start = self.position;
2499                    self.advance();
2500                    match self.current_char() {
2501                        Some('{') => {
2502                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
2503                            parts.push(StringPart::Expression(Arc::from(
2504                                &self.input[part_start..self.position],
2505                            )));
2506                        }
2507                        Some(ch) if is_perl_identifier_start(ch) => {
2508                            let var_start = self.position;
2509
2510                            // Fast path for ASCII identifier continuation
2511                            while self.position < self.input_bytes.len() {
2512                                let byte = self.input_bytes[self.position];
2513                                if byte.is_ascii_alphanumeric() || byte == b'_' {
2514                                    self.position += 1;
2515                                } else if byte >= 128 {
2516                                    // Only use UTF-8 parsing for non-ASCII
2517                                    if let Some(ch) = self.current_char() {
2518                                        if is_perl_identifier_continue(ch) {
2519                                            self.advance();
2520                                        } else {
2521                                            break;
2522                                        }
2523                                    } else {
2524                                        break;
2525                                    }
2526                                } else {
2527                                    break;
2528                                }
2529                            }
2530
2531                            if self.position > var_start {
2532                                let var_name = &self.input[part_start..self.position];
2533                                parts.push(StringPart::Variable(Arc::from(var_name)));
2534
2535                                if self.matches_bytes(b"->") {
2536                                    let tail_start = self.position;
2537                                    self.advance();
2538                                    self.advance();
2539
2540                                    match self.current_char() {
2541                                        Some('[') => {
2542                                            let _ = self
2543                                                .consume_balanced_segment_in_string('[', ']', '"');
2544                                            parts.push(StringPart::MethodCall(Arc::from(
2545                                                &self.input[tail_start..self.position],
2546                                            )));
2547                                        }
2548                                        Some('{') => {
2549                                            let _ = self
2550                                                .consume_balanced_segment_in_string('{', '}', '"');
2551                                            parts.push(StringPart::MethodCall(Arc::from(
2552                                                &self.input[tail_start..self.position],
2553                                            )));
2554                                        }
2555                                        Some('(') => {
2556                                            let _ = self
2557                                                .consume_balanced_segment_in_string('(', ')', '"');
2558                                            parts.push(StringPart::MethodCall(Arc::from(
2559                                                &self.input[tail_start..self.position],
2560                                            )));
2561                                        }
2562                                        Some(ch) if is_perl_identifier_start(ch) => {
2563                                            while self.position < self.input_bytes.len() {
2564                                                let byte = self.input_bytes[self.position];
2565                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
2566                                                    self.position += 1;
2567                                                } else if byte >= 128 {
2568                                                    if let Some(ch) = self.current_char() {
2569                                                        if is_perl_identifier_continue(ch) {
2570                                                            self.advance();
2571                                                        } else {
2572                                                            break;
2573                                                        }
2574                                                    } else {
2575                                                        break;
2576                                                    }
2577                                                } else {
2578                                                    break;
2579                                                }
2580                                            }
2581                                            if self.current_char() == Some('(') {
2582                                                let _ = self.consume_balanced_segment_in_string(
2583                                                    '(', ')', '"',
2584                                                );
2585                                            }
2586                                            parts.push(StringPart::MethodCall(Arc::from(
2587                                                &self.input[tail_start..self.position],
2588                                            )));
2589                                        }
2590                                        _ => {
2591                                            parts.push(StringPart::MethodCall(Arc::from(
2592                                                &self.input[tail_start..self.position],
2593                                            )));
2594                                        }
2595                                    }
2596                                } else if self.current_char() == Some('[') {
2597                                    let tail_start = self.position;
2598                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
2599                                    parts.push(StringPart::ArraySlice(Arc::from(
2600                                        &self.input[tail_start..self.position],
2601                                    )));
2602                                } else if self.current_char() == Some('{') {
2603                                    let tail_start = self.position;
2604                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
2605                                    parts.push(StringPart::Expression(Arc::from(
2606                                        &self.input[tail_start..self.position],
2607                                    )));
2608                                }
2609                            }
2610                        }
2611                        _ => {}
2612                    }
2613                }
2614                _ => {
2615                    // Optimize string building with better capacity management
2616                    if current_literal.capacity() == 0 {
2617                        current_literal.reserve(32);
2618                    }
2619                    current_literal.push(ch);
2620                    self.advance();
2621                }
2622            }
2623
2624            // Safety check: ensure we're making progress
2625            if self.position == last_pos {
2626                break;
2627            }
2628            last_pos = self.position;
2629        }
2630
2631        Some(self.unterminated_string_error(start))
2632    }
2633
2634    fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2635        self.advance(); // Skip opening quote
2636
2637        let mut last_pos = self.position;
2638
2639        while let Some(ch) = self.current_char() {
2640            match ch {
2641                '\'' => {
2642                    self.advance();
2643                    let text = &self.input[start..self.position];
2644                    self.mode = LexerMode::ExpectOperator;
2645
2646                    return Some(Token {
2647                        token_type: TokenType::StringLiteral,
2648                        text: Arc::from(text),
2649                        start,
2650                        end: self.position,
2651                    });
2652                }
2653                '\\' => {
2654                    self.advance();
2655                    if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2656                        self.advance();
2657                    }
2658                }
2659                _ => self.advance(),
2660            }
2661
2662            // Safety check: ensure we're making progress
2663            if self.position == last_pos {
2664                break;
2665            }
2666            last_pos = self.position;
2667        }
2668
2669        Some(self.unterminated_string_error(start))
2670    }
2671
2672    fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2673        self.advance(); // Skip opening backtick
2674
2675        let mut last_pos = self.position;
2676
2677        while let Some(ch) = self.current_char() {
2678            match ch {
2679                '`' => {
2680                    self.advance();
2681                    let text = &self.input[start..self.position];
2682                    self.mode = LexerMode::ExpectOperator;
2683
2684                    return Some(Token {
2685                        token_type: TokenType::QuoteCommand,
2686                        text: Arc::from(text),
2687                        start,
2688                        end: self.position,
2689                    });
2690                }
2691                '\\' => {
2692                    self.advance();
2693                    if self.current_char().is_some() {
2694                        self.advance();
2695                    }
2696                }
2697                _ => self.advance(),
2698            }
2699
2700            // Safety check: ensure we're making progress
2701            if self.position == last_pos {
2702                break;
2703            }
2704            last_pos = self.position;
2705        }
2706
2707        Some(self.unterminated_string_error(start))
2708    }
2709
2710    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
2711        // Simplified q-string parsing
2712        None
2713    }
2714
2715    #[inline]
2716    fn unterminated_string_error(&mut self, start: usize) -> Token {
2717        // Consume to EOF so the caller receives a single terminal error token.
2718        let end = self.input.len();
2719        self.position = end;
2720
2721        Token {
2722            token_type: TokenType::Error(Arc::from("unterminated string")),
2723            text: Arc::from(&self.input[start..end]),
2724            start,
2725            end,
2726        }
2727    }
2728
2729    fn parse_substitution(&mut self, start: usize) -> Option<Token> {
2730        // We've already consumed 's'
2731        let delimiter = self.current_char()?;
2732        self.advance(); // Skip delimiter
2733        self.parse_substitution_with_delimiter(start, delimiter)
2734    }
2735
2736    fn parse_substitution_with_delimiter(
2737        &mut self,
2738        start: usize,
2739        delimiter: char,
2740    ) -> Option<Token> {
2741        let (_pattern, pattern_closed) = self.read_delimited_body(delimiter);
2742        let replacement_closed;
2743
2744        let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
2745        if pattern_is_paired {
2746            self.skip_paired_substitution_replacement_gap();
2747
2748            if let Some(repl_delim) = self.current_char()
2749                && Self::is_quote_delim(repl_delim)
2750            {
2751                self.advance();
2752                let (_replacement, closed) = self.read_substitution_replacement_body(repl_delim);
2753                replacement_closed = closed;
2754            } else {
2755                replacement_closed = false;
2756            }
2757        } else {
2758            let (_replacement, closed) = self.read_substitution_replacement_body(delimiter);
2759            replacement_closed = closed;
2760        }
2761
2762        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2763        while let Some(ch) = self.current_char() {
2764            if ch.is_ascii_alphanumeric() {
2765                self.advance();
2766            } else {
2767                break;
2768            }
2769        }
2770
2771        let text = &self.input[start..self.position];
2772        self.mode = LexerMode::ExpectOperator;
2773
2774        let token_type = if pattern_closed && replacement_closed {
2775            TokenType::Substitution
2776        } else {
2777            TokenType::Error(Arc::from(format!(
2778                "unclosed quote-like operator 's' delimiter '{}'",
2779                delimiter
2780            )))
2781        };
2782
2783        Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2784    }
2785
2786    fn skip_paired_substitution_replacement_gap(&mut self) {
2787        let mut comment_eligible = false;
2788        loop {
2789            let mut saw_whitespace = false;
2790            while self.current_char().is_some_and(char::is_whitespace) {
2791                self.advance();
2792                saw_whitespace = true;
2793            }
2794            comment_eligible |= saw_whitespace;
2795
2796            if comment_eligible && self.current_char() == Some('#') {
2797                while let Some(ch) = self.current_char() {
2798                    self.advance();
2799                    if matches!(ch, '\n' | '\r') {
2800                        break;
2801                    }
2802                }
2803                comment_eligible = true;
2804                continue;
2805            }
2806
2807            break;
2808        }
2809    }
2810
2811    fn read_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2812        if quote_handler::paired_close(delim).is_some() {
2813            return self.read_delimited_body(delim);
2814        }
2815
2816        self.read_unpaired_substitution_replacement_body(delim)
2817    }
2818
2819    fn read_unpaired_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2820        let mut body = String::new();
2821        let mut escaped = false;
2822
2823        while let Some(ch) = self.current_char() {
2824            if escaped {
2825                body.push(ch);
2826                self.advance();
2827                escaped = false;
2828                continue;
2829            }
2830
2831            match ch {
2832                '\\' => {
2833                    body.push(ch);
2834                    self.advance();
2835                    escaped = true;
2836                }
2837                '"' | '\'' if ch != delim => {
2838                    if let Some((string_end, true)) =
2839                        self.scan_inner_string_for_delimiter(self.position, ch, delim)
2840                    {
2841                        if let Some(string_text) = self.input.get(self.position..string_end) {
2842                            body.push_str(string_text);
2843                            self.position = string_end;
2844                        } else {
2845                            body.push(ch);
2846                            self.advance();
2847                        }
2848                    } else {
2849                        body.push(ch);
2850                        self.advance();
2851                    }
2852                }
2853                c if c == delim => {
2854                    self.advance();
2855                    return (body, true);
2856                }
2857                _ => {
2858                    body.push(ch);
2859                    self.advance();
2860                }
2861            }
2862        }
2863
2864        (body, false)
2865    }
2866
2867    fn scan_inner_string_for_delimiter(
2868        &self,
2869        start: usize,
2870        quote: char,
2871        delim: char,
2872    ) -> Option<(usize, bool)> {
2873        if Self::is_word_apostrophe(self.input, start, quote) {
2874            return None;
2875        }
2876        // Adjacent quotes are literal replacement text (for example s/"/""/g),
2877        // not a string literal to skip while hunting for the replacement delimiter.
2878        if self.input.get(..start).and_then(|text| text.chars().next_back()) == Some(quote) {
2879            return None;
2880        }
2881        let mut pos = start.checked_add(quote.len_utf8())?;
2882        let expression_quote = Self::can_start_replacement_expression_quote(self.input, start);
2883        if !expression_quote && self.input.get(pos..).is_some_and(|text| text.starts_with(delim)) {
2884            return None;
2885        }
2886        if self.input.get(pos..).is_some_and(|text| text.starts_with(quote)) {
2887            return None;
2888        }
2889        let mut escaped = false;
2890        let mut contains_delim = false;
2891
2892        while let Some(ch) = self.input.get(pos..).and_then(|text| text.chars().next()) {
2893            if matches!(ch, '\n' | '\r') {
2894                return None;
2895            }
2896            if !expression_quote && matches!(ch, ';' | '#') {
2897                return None;
2898            }
2899
2900            if escaped {
2901                if ch == delim {
2902                    contains_delim = true;
2903                }
2904                pos += ch.len_utf8();
2905                escaped = false;
2906                continue;
2907            }
2908
2909            match ch {
2910                '\\' => {
2911                    pos += ch.len_utf8();
2912                    escaped = true;
2913                }
2914                c if c == quote => {
2915                    return Some((pos + ch.len_utf8(), contains_delim));
2916                }
2917                c if c == delim => {
2918                    contains_delim = true;
2919                    pos += ch.len_utf8();
2920                }
2921                _ => {
2922                    pos += ch.len_utf8();
2923                }
2924            }
2925        }
2926
2927        None
2928    }
2929
/// Reports whether a quote at byte offset `pos` sits where an expression
/// operand could begin.
///
/// Only delimiter-bearing inner strings in such positions are skipped;
/// literal replacement quotes still let the next delimiter close the
/// substitution. The test looks at the last non-whitespace character before
/// `pos` and accepts opening brackets, `,` and the operator characters listed
/// below. Returns `false` when nothing but whitespace precedes `pos` or when
/// `pos` is not a valid slice boundary of `input`.
fn can_start_replacement_expression_quote(input: &str, pos: usize) -> bool {
    // Characters after which a quoted string reads as an expression operand.
    const EXPRESSION_LEADERS: &str = "([{,=:?!~+-*%&|^<>";

    let Some(prefix) = input.get(..pos) else {
        return false;
    };

    match prefix.chars().rfind(|ch| !ch.is_whitespace()) {
        Some(last) => EXPRESSION_LEADERS.contains(last),
        None => false,
    }
}
2960
/// Reports whether `quote` is an apostrophe that directly follows a word
/// character (ASCII letter, ASCII digit or `_`) ending at byte offset `pos`,
/// as in `don't`.
///
/// Any `quote` other than `'` yields `false`, as does a `pos` with no
/// preceding character or one that is not a valid slice boundary of `input`.
fn is_word_apostrophe(input: &str, pos: usize, quote: char) -> bool {
    if quote != '\'' {
        return false;
    }

    let Some(before) = input.get(..pos) else {
        return false;
    };

    matches!(
        before.chars().next_back(),
        Some(prev) if prev == '_' || prev.is_ascii_alphanumeric()
    )
}
2968
2969    fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
2970        // We've already consumed 'tr' or 'y'
2971        while self.current_char().is_some_and(char::is_whitespace) {
2972            self.advance();
2973        }
2974
2975        let delimiter = self.current_char()?;
2976        self.advance(); // Skip delimiter
2977        self.parse_transliteration_with_delimiter(start, delimiter)
2978    }
2979
2980    fn parse_transliteration_with_delimiter(
2981        &mut self,
2982        start: usize,
2983        delimiter: char,
2984    ) -> Option<Token> {
2985        let (_search, search_closed) = self.read_delimited_body(delimiter);
2986        let replacement_closed;
2987
2988        let search_is_paired = quote_handler::paired_close(delimiter).is_some();
2989        if search_is_paired {
2990            while self.current_char().is_some_and(char::is_whitespace) {
2991                self.advance();
2992            }
2993
2994            if let Some(repl_delim) = self.current_char()
2995                && Self::is_quote_delim(repl_delim)
2996            {
2997                self.advance();
2998                let (_replacement, closed) = self.read_delimited_body(repl_delim);
2999                replacement_closed = closed;
3000            } else {
3001                replacement_closed = false;
3002            }
3003        } else {
3004            let (_replacement, closed) = self.read_delimited_body(delimiter);
3005            replacement_closed = closed;
3006        }
3007
3008        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
3009        while let Some(ch) = self.current_char() {
3010            if ch.is_ascii_alphanumeric() {
3011                self.advance();
3012            } else {
3013                break;
3014            }
3015        }
3016
3017        let text = &self.input[start..self.position];
3018        self.mode = LexerMode::ExpectOperator;
3019
3020        let token_type = if search_closed && replacement_closed {
3021            TokenType::Transliteration
3022        } else {
3023            TokenType::Error(Arc::from(format!(
3024                "unclosed quote-like operator '{}' delimiter '{}'",
3025                if self.input[start..].starts_with("tr") { "tr" } else { "y" },
3026                delimiter
3027            )))
3028        };
3029
3030        Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3031    }
3032
3033    /// Read content between delimiters.
3034    ///
3035    /// Returns `(body, closed)` where `closed` is `true` if the closing
3036    /// delimiter was found before EOF, and `false` if EOF was reached first.
3037    fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
3038        let paired = quote_handler::paired_close(delim);
3039        let close = paired.unwrap_or(delim);
3040        let mut body = String::new();
3041        let mut depth = i32::from(paired.is_some());
3042
3043        while let Some(ch) = self.current_char() {
3044            if ch == '\\' {
3045                body.push(ch);
3046                self.advance();
3047                if let Some(next) = self.current_char() {
3048                    body.push(next);
3049                    self.advance();
3050                }
3051                continue;
3052            }
3053
3054            if paired.is_some() && ch == delim {
3055                body.push(ch);
3056                self.advance();
3057                depth += 1;
3058                continue;
3059            }
3060
3061            if ch == close {
3062                if paired.is_some() {
3063                    depth -= 1;
3064                    if depth == 0 {
3065                        self.advance();
3066                        return (body, true);
3067                    }
3068                    body.push(ch);
3069                    self.advance();
3070                } else {
3071                    self.advance();
3072                    return (body, true);
3073                }
3074                continue;
3075            }
3076
3077            body.push(ch);
3078            self.advance();
3079        }
3080
3081        // EOF reached without finding the closing delimiter
3082        (body, false)
3083    }
3084
3085    /// Parse a quote operator after we've seen the delimiter
3086    fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3087        let info = self.current_quote_op.as_ref()?;
3088        let start = info.start_pos;
3089        let operator = info.operator.clone();
3090
3091        // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3092        // below) does not leave a stale reference behind. The post-match cleanup at the
3093        // bottom of this function would otherwise be skipped for those operators.
3094        self.current_quote_op = None;
3095
3096        // Parse based on operator type; track whether all delimiters were closed.
3097        let closed = match operator.as_str() {
3098            "s" => {
3099                return self.parse_substitution_with_delimiter(start, delimiter);
3100            }
3101            "tr" | "y" => {
3102                return self.parse_transliteration_with_delimiter(start, delimiter);
3103            }
3104            "qr" => {
3105                let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3106                self.parse_regex_modifiers(&quote_handler::QR_SPEC);
3107                body_closed
3108            }
3109            "m" => {
3110                let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3111                self.parse_regex_modifiers(&quote_handler::M_SPEC);
3112                body_closed
3113            }
3114            _ => {
3115                // q, qq, qw, qx - no modifiers
3116                let (_body, body_closed) = self.read_delimited_body(delimiter);
3117                body_closed
3118            }
3119        };
3120
3121        let text = &self.input[start..self.position];
3122
3123        self.mode = LexerMode::ExpectOperator;
3124
3125        if !closed {
3126            // EOF reached before finding the closing delimiter — emit an error
3127            // token so the parser's recovery mechanism records a diagnostic.
3128            return Some(Token {
3129                token_type: TokenType::Error(Arc::from(format!(
3130                    "unclosed {} delimiter '{}'",
3131                    operator, delimiter
3132                ))),
3133                text: Arc::from(text),
3134                start,
3135                end: self.position,
3136            });
3137        }
3138
3139        let token_type = quote_handler::get_quote_token_type(&operator);
3140        Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3141    }
3142
3143    /// Parse regex modifiers according to the given spec
3144    ///
3145    /// This function includes ALL characters that could be intended as modifiers,
3146    /// including invalid ones. This allows the parser to properly reject invalid
3147    /// modifiers with a clear error message, rather than leaving them as separate
3148    /// tokens that could be confusingly parsed.
3149    fn parse_regex_modifiers(&mut self, _spec: &quote_handler::ModSpec) {
3150        // Consume all alphanumeric characters that could be intended as modifiers
3151        // The parser will validate and reject invalid ones
3152        while let Some(ch) = self.current_char() {
3153            if ch.is_ascii_alphanumeric() {
3154                self.advance();
3155            } else {
3156                break;
3157            }
3158        }
3159        // Note: We no longer validate here - the parser will validate and provide
3160        // clear error messages for invalid modifiers (MUT_005 fix)
3161    }
3162
3163    /// Parse a regex literal starting with `/`
3164    ///
3165    /// **Budget Protection (Issue #422)**:
3166    /// - Budget guards prevent runaway scanning on pathological input
3167    /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
3168    /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
3169    /// - Graceful degradation: emit UnknownRest token if budget exceeded
3170    ///
3171    /// **Performance**:
3172    /// - Single-pass scanning with escape handling
3173    /// - Budget check per iteration (amortized O(1) via inline fast path)
3174    /// - Typical regex: <10μs, Large regex (64KB): ~1ms
3175    fn parse_regex(&mut self, start: usize) -> Option<Token> {
3176        self.advance(); // Skip opening /
3177
3178        let mut regex_parse_steps: usize = 0;
3179        let mut in_character_class = false;
3180
3181        while let Some(ch) = self.current_char() {
3182            regex_parse_steps += 1;
3183            if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
3184                #[cfg(debug_assertions)]
3185                {
3186                    let text = &self.input[start..self.position];
3187                    let preview = truncate_preview(text, 50);
3188                    tracing::debug!(
3189                        limit = MAX_REGEX_PARSE_STEPS,
3190                        pattern_preview = %preview,
3191                        "Regex parse step budget exceeded"
3192                    );
3193                }
3194                self.position = self.input.len();
3195                return Some(Token {
3196                    token_type: TokenType::UnknownRest,
3197                    text: empty_arc(),
3198                    start,
3199                    end: self.position,
3200                });
3201            }
3202
3203            // Budget guard: prevent timeout on pathological input (Issue #422)
3204            // If exceeded, returns UnknownRest token for graceful degradation
3205            if let Some(token) = self.budget_guard(start, 0) {
3206                return Some(token);
3207            }
3208
3209            match ch {
3210                '/' if !in_character_class => {
3211                    self.advance();
3212                    // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
3213                    while let Some(ch) = self.current_char() {
3214                        if ch.is_ascii_alphanumeric() {
3215                            self.advance();
3216                        } else {
3217                            break;
3218                        }
3219                    }
3220
3221                    let text = &self.input[start..self.position];
3222                    self.mode = LexerMode::ExpectOperator;
3223
3224                    return Some(Token {
3225                        token_type: TokenType::RegexMatch,
3226                        text: Arc::from(text),
3227                        start,
3228                        end: self.position,
3229                    });
3230                }
3231                '\\' => {
3232                    // Handle escape sequences: consume backslash + next char
3233                    self.advance();
3234                    if self.current_char().is_some() {
3235                        self.advance();
3236                    }
3237                }
3238                '[' => {
3239                    in_character_class = true;
3240                    self.advance();
3241                }
3242                ']' if in_character_class => {
3243                    in_character_class = false;
3244                    self.advance();
3245                }
3246                _ => self.advance(),
3247            }
3248        }
3249
3250        // Unterminated regex - EOF reached before closing /
3251        // Parser will emit diagnostic for unterminated literal
3252        None
3253    }
3254}
3255
3256// Checkpoint support for incremental parsing
3257
3258mod checkpoint_impl;
3259
3260#[cfg(test)]
3261mod test_format_debug;
3262#[cfg(test)]
3263mod tests;
perl_lexer/lib.rs

perl_lexer/
lib.rs