// perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//!
98//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
99//! all previously parsed symbols, allowing continued analysis.
100//!
101//! # Integration with perl-parser
102//!
103//! The lexer is designed to work seamlessly with `perl_parser::Parser`:
104//!
105//! ```rust,ignore
106//! use perl_parser::Parser;
107//!
108//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
109//! let code = "sub hello { print qq{Hello, world!\\n}; }";
110//! let mut parser = Parser::new(code);
111//! let ast = parser.parse()?;
112//! # Ok(())
113//! # }
114//! ```
115//!
116//! The parser automatically creates and manages a `PerlLexer` instance internally.
117
118#![warn(clippy::all)]
119#![allow(
120 // Core allows for lexer code
121 clippy::too_many_lines,
122 clippy::module_name_repetitions,
123 clippy::cast_possible_truncation,
124 clippy::cast_sign_loss,
125 clippy::cast_possible_wrap,
126 clippy::cast_precision_loss,
127 clippy::must_use_candidate,
128 clippy::missing_errors_doc,
129 clippy::missing_panics_doc,
130
131 // Lexer-specific patterns that are fine
132 clippy::match_same_arms,
133 clippy::redundant_else,
134 clippy::unnecessary_wraps,
135 clippy::unused_self,
136 clippy::items_after_statements,
137 clippy::struct_excessive_bools,
138 clippy::uninlined_format_args
139)]
140
141use perl_keywords::is_lexer_keyword;
142use std::sync::{Arc, OnceLock};
143
144pub mod checkpoint;
145pub mod error;
146pub mod mode;
147mod quote_handler;
148pub mod token;
149mod unicode;
150
151pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
152pub use error::{LexerError, Result};
153pub use mode::LexerMode;
154pub use perl_position_tracking::Position;
155pub use token::{StringPart, Token, TokenType};
156
157use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
158
/// Specification for a pending heredoc
///
/// Created when a `<<LABEL` introducer is lexed; the body is consumed later,
/// starting on the first line after the introducer's physical line.
#[derive(Clone)]
struct HeredocSpec {
    /// Terminator label without quotes, e.g. `EOF` for `<<"EOF"`.
    label: Arc<str>,
    body_start: usize, // byte offset where the body begins (0 = not yet known)
    allow_indent: bool, // true if we saw <<~ (Perl 5.26 indented heredocs)
}
166
// Budget limits to prevent hangs on pathological input
// When these limits are exceeded, the lexer gracefully truncates the token
// as UnknownRest, preserving all previously parsed symbols and allowing
// continued analysis of the remainder. LSP clients may emit a soft diagnostic
// about truncation but won't crash or hang.
const MAX_REGEX_BYTES: usize = 64 * 1024; // 64KB max for regex patterns
const MAX_HEREDOC_BYTES: usize = 256 * 1024; // 256KB max for heredoc bodies
const MAX_DELIM_NEST: usize = 128; // Max nesting depth for delimiters
const MAX_HEREDOC_DEPTH: usize = 100; // Max heredocs queued at once (checked in try_heredoc)
const HEREDOC_TIMEOUT_MS: u64 = 5000; // 5 seconds wall-clock timeout for heredoc parsing
177
/// Configuration for the lexer
#[derive(Debug, Clone)]
pub struct LexerConfig {
    /// Enable interpolation parsing in strings (default: `true`)
    pub parse_interpolation: bool,
    /// Track token positions for error reporting (default: `true`)
    pub track_positions: bool,
    /// Maximum lookahead for disambiguation, in bytes (default: `1024`)
    pub max_lookahead: usize,
}
188
189impl Default for LexerConfig {
190 fn default() -> Self {
191 Self { parse_interpolation: true, track_positions: true, max_lookahead: 1024 }
192 }
193}
194
/// Mode-aware Perl lexer
///
/// Borrows the source text for lifetime `'a`; construct via [`PerlLexer::new`],
/// [`PerlLexer::with_config`], or [`PerlLexer::with_body_tokens`], then pull
/// tokens with `next_token` / `collect_tokens`.
pub struct PerlLexer<'a> {
    /// Source text being tokenized
    input: &'a str,
    /// Cached input bytes for faster access
    input_bytes: &'a [u8],
    /// Current byte offset into `input`
    position: usize,
    /// Term/operator expectation used to disambiguate `/`, `%`, `<<`, etc.
    mode: LexerMode,
    /// Lexer configuration (interpolation, positions, lookahead budget)
    config: LexerConfig,
    /// Stack for nested delimiters in s{}{} constructs
    delimiter_stack: Vec<char>,
    /// Track if we're inside prototype parens after 'sub'
    in_prototype: bool,
    /// Paren depth to track when we exit prototype
    prototype_depth: usize,
    /// Current position with line/column tracking
    #[allow(dead_code)]
    current_pos: Position,
    /// Track if we just skipped a newline (for __DATA__/__END__ detection)
    after_newline: bool,
    /// Queue of pending heredocs waiting for their bodies (FIFO order)
    pending_heredocs: Vec<HeredocSpec>,
    /// Track the byte offset of the current line's start
    line_start_offset: usize,
    /// If true, emit `HeredocBody` tokens; otherwise just consume them.
    emit_heredoc_body_tokens: bool,
    /// Current quote operator being parsed
    current_quote_op: Option<quote_handler::QuoteOperatorInfo>,
    /// Track if EOF has been emitted to prevent infinite loops
    eof_emitted: bool,
    /// Start time for timeout protection (heredoc parsing)
    start_time: std::time::Instant,
}
227
228impl<'a> PerlLexer<'a> {
229 /// Create a new lexer for the given input
230 pub fn new(input: &'a str) -> Self {
231 Self::with_config(input, LexerConfig::default())
232 }
233
234 /// Create a new lexer with custom configuration
235 pub fn with_config(input: &'a str, config: LexerConfig) -> Self {
236 Self {
237 input,
238 input_bytes: input.as_bytes(),
239 position: 0,
240 mode: LexerMode::ExpectTerm,
241 config,
242 delimiter_stack: Vec::new(),
243 in_prototype: false,
244 prototype_depth: 0,
245 current_pos: Position::start(),
246 after_newline: true, // Start of file counts as after newline
247 pending_heredocs: Vec::new(),
248 line_start_offset: 0,
249 emit_heredoc_body_tokens: false,
250 current_quote_op: None,
251 eof_emitted: false,
252 start_time: std::time::Instant::now(),
253 }
254 }
255
256 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
257 pub fn with_body_tokens(input: &'a str) -> Self {
258 let mut lexer = Self::new(input);
259 lexer.emit_heredoc_body_tokens = true;
260 lexer
261 }
262
263 /// Normalize file start by skipping BOM if present
264 fn normalize_file_start(&mut self) {
265 // Skip UTF-8 BOM (EF BB BF) if at file start
266 if self.position == 0 && self.matches_bytes(&[0xEF, 0xBB, 0xBF]) {
267 self.position = 3;
268 self.line_start_offset = 3;
269 }
270 }
271
    /// Set the lexer mode (for resetting state at statement boundaries)
    ///
    /// Forcing `ExpectTerm` after e.g. `;` lets context-sensitive tokens such
    /// as `/` be lexed as a regex rather than division.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
276
277 /// Helper to check if remaining bytes on a line are only spaces/tabs
278 #[inline]
279 fn trailing_ws_only(bytes: &[u8], mut p: usize) -> bool {
280 while p < bytes.len() && bytes[p] != b'\n' && bytes[p] != b'\r' {
281 match bytes[p] {
282 b' ' | b'\t' => p += 1,
283 _ => return false,
284 }
285 }
286 true
287 }
288
289 /// Consume a newline sequence (CRLF or LF) and update state
290 #[inline]
291 fn consume_newline(&mut self) {
292 if self.position >= self.input.len() {
293 return;
294 }
295 match self.input_bytes[self.position] {
296 b'\r' => {
297 self.position += 1;
298 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n' {
299 self.position += 1;
300 }
301 }
302 b'\n' => self.advance(),
303 _ => return, // not at a newline
304 }
305 self.after_newline = true;
306 self.line_start_offset = self.position;
307 }
308
309 /// Find the end of the current line, returning both raw end and visible end (without trailing CR)
310 #[inline]
311 fn find_line_end(bytes: &[u8], start: usize) -> (usize, usize) {
312 let mut end = start;
313 while end < bytes.len() && bytes[end] != b'\n' && bytes[end] != b'\r' {
314 end += 1;
315 }
316 // Visible end strips trailing \r if followed by \n
317 let visible_end = if end > start && end > 0 && bytes[end.saturating_sub(1)] == b'\r' {
318 end - 1
319 } else {
320 end
321 };
322 (end, visible_end)
323 }
324
    /// Get the next token from the input
    ///
    /// Main lexer entry point. Per call it:
    /// 1. Skips a UTF-8 BOM on the very first call (`position == 0`).
    /// 2. Delegates to the format-body / data-section sub-parsers when the
    ///    mode demands it.
    /// 3. Consumes any pending heredoc body (FIFO) before ordinary lexing,
    ///    bounded by `MAX_HEREDOC_BYTES` and `HEREDOC_TIMEOUT_MS`.
    /// 4. Skips whitespace/comments/POD, then tries each token class in
    ///    order: heredoc introducer, string, variable, number,
    ///    identifier/keyword, quote-operator delimiter, operator, delimiter.
    /// 5. Emits exactly one `EOF` token at end of input, then `None`.
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow
                // (body_start == 0 is the "not inside a body" sentinel)
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // <<~HEREDOC: allow any leading spaces/tabs before the label
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any):
                            // stacked heredocs (`<<A . <<B`) share introducer lines,
                            // so B's body begins right after A's terminator.
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                // Emit EOF exactly once, then None forever after.
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first; order matters (e.g. heredoc
            // must win over the `<` operator, strings over identifiers).
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Slower path for Unicode
                Arc::from(ch.to_string())
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
581
582 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
583 ///
584 /// **Purpose**: Protect against pathological input that could cause:
585 /// - Infinite loops in regex/heredoc parsing
586 /// - Excessive memory consumption
587 /// - LSP server hangs
588 ///
589 /// **Limits**:
590 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
591 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
592 ///
593 /// **Graceful Degradation**:
594 /// - Budget exceeded → emit `UnknownRest` token
595 /// - Jump to EOF to prevent further parsing of problematic region
596 /// - LSP client can emit soft diagnostic about truncation
597 /// - All previously parsed symbols remain valid
598 ///
599 /// **Performance**:
600 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
601 /// - Slow path: Only triggered on pathological input
602 /// - Amortized cost: O(1) per token
603 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
604 #[inline(always)]
605 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
606 // Fast path: most calls won't hit limits
607 let bytes_consumed = self.position - start;
608 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
609 return None;
610 }
611
612 // Slow path: budget exceeded - graceful degradation
613 // Note: In production LSP, this event could be logged/metered for monitoring
614 #[cfg(debug_assertions)]
615 {
616 eprintln!(
617 "Budget exceeded: bytes={}, depth={}, at position={}",
618 bytes_consumed, depth, self.position
619 );
620 }
621
622 self.position = self.input.len();
623 Some(Token {
624 token_type: TokenType::UnknownRest,
625 text: Arc::from(""),
626 start,
627 end: self.position,
628 })
629 }
630
631 /// Peek at the next token without consuming it
632 pub fn peek_token(&mut self) -> Option<Token> {
633 let saved_pos = self.position;
634 let saved_mode = self.mode;
635 let saved_prototype = self.in_prototype;
636 let saved_depth = self.prototype_depth;
637 let saved_after_newline = self.after_newline;
638
639 let token = self.next_token();
640
641 self.position = saved_pos;
642 self.mode = saved_mode;
643 self.in_prototype = saved_prototype;
644 self.prototype_depth = saved_depth;
645 self.after_newline = saved_after_newline;
646
647 token
648 }
649
650 /// Get all remaining tokens
651 pub fn collect_tokens(&mut self) -> Vec<Token> {
652 let mut tokens = Vec::new();
653 while let Some(token) = self.next_token() {
654 if token.token_type == TokenType::EOF {
655 tokens.push(token);
656 break;
657 }
658 tokens.push(token);
659 }
660 tokens
661 }
662
663 /// Reset the lexer to the beginning
664 pub fn reset(&mut self) {
665 self.position = 0;
666 self.mode = LexerMode::ExpectTerm;
667 self.delimiter_stack.clear();
668 self.in_prototype = false;
669 self.prototype_depth = 0;
670 self.after_newline = true;
671 self.pending_heredocs.clear();
672 self.line_start_offset = 0;
673 }
674
    /// Switch lexer to format body parsing mode
    ///
    /// While in `InFormatBody` mode, `next_token` delegates to
    /// `parse_format_body` instead of normal tokenization. Intended for
    /// callers that have just recognized a `format` declaration; the
    /// format-body parser (not shown here) is expected to switch the mode back.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
679
680 // Internal helper methods
681
682 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
683 #[inline(always)]
684 fn byte_at(bytes: &[u8], index: usize) -> u8 {
685 debug_assert!(index < bytes.len());
686 match bytes.get(index) {
687 Some(&byte) => byte,
688 None => 0,
689 }
690 }
691
692 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
693 #[inline(always)]
694 fn current_char(&self) -> Option<char> {
695 if self.position < self.input_bytes.len() {
696 // For ASCII, direct access is safe
697 let byte = Self::byte_at(self.input_bytes, self.position);
698 if byte < 128 {
699 Some(byte as char)
700 } else {
701 // For non-ASCII, fall back to proper UTF-8 parsing
702 self.input.get(self.position..).and_then(|s| s.chars().next())
703 }
704 } else {
705 None
706 }
707 }
708
709 #[inline(always)]
710 fn peek_char(&self, offset: usize) -> Option<char> {
711 if offset > self.config.max_lookahead {
712 return None;
713 }
714
715 let pos = self.position.checked_add(offset)?;
716 if pos < self.input_bytes.len() {
717 // For ASCII, direct access is safe
718 let byte = Self::byte_at(self.input_bytes, pos);
719 if byte < 128 {
720 Some(byte as char)
721 } else {
722 // For non-ASCII, use chars iterator
723 self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
724 }
725 } else {
726 None
727 }
728 }
729
730 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
731 #[inline(always)]
732 fn advance(&mut self) {
733 if self.position < self.input_bytes.len() {
734 let byte = Self::byte_at(self.input_bytes, self.position);
735 if byte < 128 {
736 // ASCII fast path
737 self.position += 1;
738 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
739 {
740 self.position += ch.len_utf8();
741 }
742 }
743 }
744
745 /// Fast byte-level check for ASCII characters
746 #[inline]
747 fn peek_byte(&self, offset: usize) -> Option<u8> {
748 if offset > self.config.max_lookahead {
749 return None;
750 }
751
752 let pos = self.position.checked_add(offset)?;
753 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
754 }
755
756 /// Check if the next bytes match a pattern (ASCII only)
757 #[inline]
758 fn matches_bytes(&self, pattern: &[u8]) -> bool {
759 let Some(end_offset) = pattern.len().checked_sub(1) else {
760 return true;
761 };
762
763 if end_offset > self.config.max_lookahead {
764 return false;
765 }
766
767 let Some(end) = self.position.checked_add(pattern.len()) else {
768 return false;
769 };
770
771 if end <= self.input_bytes.len() {
772 &self.input_bytes[self.position..end] == pattern
773 } else {
774 false
775 }
776 }
777
    /// Skip whitespace, `#` line comments, and POD blocks before the next token.
    ///
    /// Side effects:
    /// - Clears `after_newline` unless we sit exactly at a line start.
    /// - On every newline consumed, assigns `body_start` to the first pending
    ///   heredoc that does not have one yet (FIFO).
    /// - In `ExpectDelimiter` mode, `#` is a quote delimiter, not a comment,
    ///   and stops the skip.
    ///
    /// Always returns `Some(())`; the `Option` return exists so `next_token`
    /// can `?`-chain it.
    #[inline]
    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
        // Don't reset after_newline if we're at the start of a line
        if self.position > 0 && self.position != self.line_start_offset {
            self.after_newline = false;
        }

        while self.position < self.input_bytes.len() {
            let byte = Self::byte_at(self.input_bytes, self.position);
            match byte {
                // Fast path for ASCII whitespace - batch process
                b' ' => {
                    // Batch skip spaces for better cache efficiency
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b' '
                    {
                        self.position += 1;
                    }
                    // Continue outer loop if we processed any spaces
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\t' => {
                    // Batch skip tabs
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b'\t'
                    {
                        self.position += 1;
                    }
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\r' | b'\n' => {
                    self.consume_newline();

                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                    // Only check if we have pending heredocs to avoid unnecessary work
                    if !self.pending_heredocs.is_empty() {
                        for spec in &mut self.pending_heredocs {
                            if spec.body_start == 0 {
                                spec.body_start = self.position;
                                break; // Only set for the first unresolved heredoc
                            }
                        }
                    }
                }
                b'#' => {
                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
                        break;
                    }

                    // Skip line comment using memchr for fast newline search
                    self.position += 1; // Skip # directly

                    // Use memchr to find newline quickly
                    // NOTE(review): only '\n' is searched; a comment on a line
                    // terminated by a lone '\r' is skipped through to the next
                    // '\n' or EOF — confirm lone-CR endings are out of scope.
                    if let Some(newline_offset) =
                        memchr::memchr(b'\n', &self.input_bytes[self.position..])
                    {
                        self.position += newline_offset;
                    } else {
                        // No newline found, skip to end
                        self.position = self.input_bytes.len();
                    }
                }
                // POD directives are only recognized at the start of a line.
                b'=' if self.position == 0
                    || (self.position > 0 && self.input_bytes[self.position - 1] == b'\n') =>
                {
                    // Check if this starts a POD section (=pod, =head, =over, etc.)
                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                    let remaining = &self.input_bytes[self.position..];
                    if remaining.starts_with(b"=pod")
                        || remaining.starts_with(b"=head")
                        || remaining.starts_with(b"=over")
                        || remaining.starts_with(b"=item")
                        || remaining.starts_with(b"=back")
                        || remaining.starts_with(b"=begin")
                        || remaining.starts_with(b"=end")
                        || remaining.starts_with(b"=for")
                        || remaining.starts_with(b"=encoding")
                    {
                        // Scan forward for \n=cut (end of POD block)
                        let search_start = self.position;
                        let mut found_cut = false;
                        let bytes = self.input_bytes;
                        let mut i = search_start;
                        while i < bytes.len() {
                            // Look for =cut at the start of a line
                            if (i == 0 || bytes[i - 1] == b'\n') && bytes[i..].starts_with(b"=cut")
                            {
                                i += 4; // Skip "=cut"
                                // Skip rest of the =cut line
                                while i < bytes.len() && bytes[i] != b'\n' {
                                    i += 1;
                                }
                                // Consume the trailing newline if present
                                if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                                self.position = i;
                                found_cut = true;
                                break;
                            }
                            i += 1;
                        }
                        if !found_cut {
                            // POD extends to end of file
                            self.position = bytes.len();
                        }
                        continue;
                    }
                    // Not a POD directive - regular '=' token
                    break;
                }
                _ => {
                    // For non-ASCII whitespace, use char check only when needed
                    if byte >= 128
                        && let Some(ch) = self.current_char()
                        && ch.is_whitespace()
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }
        }
        Some(())
    }
911
912 fn try_heredoc(&mut self) -> Option<Token> {
913 // Check for heredoc start
914 if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
915 return None;
916 }
917
918 let start = self.position;
919 let mut text = String::from("<<");
920 self.position += 2; // Skip <<
921
922 // Check for indented heredoc (~)
923 let allow_indent = if self.current_char() == Some('~') {
924 text.push('~');
925 self.advance();
926 true
927 } else {
928 false
929 };
930
931 // Skip whitespace
932 while let Some(ch) = self.current_char() {
933 if ch == ' ' || ch == '\t' {
934 text.push(ch);
935 self.advance();
936 } else {
937 break;
938 }
939 }
940
941 // Optional backslash disables interpolation, treat like single-quoted label
942 let backslashed = if self.current_char() == Some('\\') {
943 text.push('\\');
944 self.advance();
945 true
946 } else {
947 false
948 };
949
950 // Parse delimiter
951 let delimiter = if self.position < self.input.len() {
952 match self.current_char() {
953 Some('"') if !backslashed => {
954 // Double-quoted delimiter
955 text.push('"');
956 self.advance();
957 let mut delim = String::new();
958 while self.position < self.input.len() {
959 if let Some(ch) = self.current_char() {
960 if ch == '"' {
961 text.push('"');
962 self.advance();
963 break;
964 }
965 delim.push(ch);
966 text.push(ch);
967 self.advance();
968 } else {
969 break;
970 }
971 }
972 delim
973 }
974 Some('\'') if !backslashed => {
975 // Single-quoted delimiter
976 text.push('\'');
977 self.advance();
978 let mut delim = String::new();
979 while self.position < self.input.len() {
980 if let Some(ch) = self.current_char() {
981 if ch == '\'' {
982 text.push('\'');
983 self.advance();
984 break;
985 }
986 delim.push(ch);
987 text.push(ch);
988 self.advance();
989 } else {
990 break;
991 }
992 }
993 delim
994 }
995 Some('`') if !backslashed => {
996 // Backtick delimiter
997 text.push('`');
998 self.advance();
999 let mut delim = String::new();
1000 while self.position < self.input.len() {
1001 if let Some(ch) = self.current_char() {
1002 if ch == '`' {
1003 text.push('`');
1004 self.advance();
1005 break;
1006 }
1007 delim.push(ch);
1008 text.push(ch);
1009 self.advance();
1010 } else {
1011 break;
1012 }
1013 }
1014 delim
1015 }
1016 Some(c) if is_perl_identifier_start(c) => {
1017 // Bare word delimiter
1018 let mut delim = String::new();
1019 while self.position < self.input.len() {
1020 if let Some(c) = self.current_char() {
1021 if is_perl_identifier_continue(c) {
1022 delim.push(c);
1023 text.push(c);
1024 self.advance();
1025 } else {
1026 break;
1027 }
1028 } else {
1029 break;
1030 }
1031 }
1032 delim
1033 }
1034 _ => {
1035 // Not a valid heredoc delimiter - reset position and return None
1036 // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
1037 self.position = start;
1038 return None;
1039 }
1040 }
1041 } else {
1042 // No delimiter found - reset position and return None
1043 self.position = start;
1044 return None;
1045 };
1046
1047 // For now, return a placeholder token
1048 // The actual heredoc body would be parsed later when we encounter it
1049 self.mode = LexerMode::ExpectOperator;
1050
1051 // Recursion depth limit (Issue #443)
1052 if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
1053 return Some(Token {
1054 token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
1055 text: Arc::from(text),
1056 start,
1057 end: self.position,
1058 });
1059 }
1060
1061 // Queue the heredoc spec with its label
1062 self.pending_heredocs.push(HeredocSpec {
1063 label: Arc::from(delimiter.as_str()),
1064 body_start: 0, // Will be set when we see the newline after this line
1065 allow_indent,
1066 });
1067
1068 Some(Token {
1069 token_type: TokenType::HeredocStart,
1070 text: Arc::from(text),
1071 start,
1072 end: self.position,
1073 })
1074 }
1075
1076 fn try_string(&mut self) -> Option<Token> {
1077 let start = self.position;
1078 let quote = self.current_char()?;
1079
1080 match quote {
1081 '"' => self.parse_double_quoted_string(start),
1082 '\'' => self.parse_single_quoted_string(start),
1083 '`' => self.parse_backtick_string(start),
1084 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1085 _ => None,
1086 }
1087 }
1088
    /// Attempt to lex a numeric literal starting at the current position.
    ///
    /// Recognizes hex (`0x`), binary (`0b`), and explicit octal (`0o`)
    /// integers, plus decimal integers/floats with an optional fractional
    /// part and `e`/`E` exponent. Underscores are accepted between digits
    /// throughout, matching Perl's numeric-literal rules.
    ///
    /// Returns `None` when the current byte is not an ASCII digit. On
    /// success the mode becomes `ExpectOperator`, since a number completes
    /// a term (so a following `/` lexes as division, not a regex).
    #[inline]
    fn try_number(&mut self) -> Option<Token> {
        let start = self.position;

        // Fast byte check for digits - optimized bounds checking
        let bytes = self.input_bytes;
        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
            return None;
        }

        // Check for hex (0x), binary (0b), or octal (0o) prefixes
        let mut pos = self.position;
        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
            let prefix_byte = bytes[pos + 1];
            if prefix_byte == b'x' || prefix_byte == b'X' {
                // Hexadecimal: 0x[0-9a-fA-F_]+
                pos += 2; // consume '0x'
                let digit_start = pos;
                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                if pos > digit_start {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No hex digits after 0x - fall through to parse '0' as decimal
            } else if prefix_byte == b'b' || prefix_byte == b'B' {
                // Binary: 0b[01_]+
                pos += 2; // consume '0b'
                let digit_start = pos;
                while pos < bytes.len()
                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
                {
                    pos += 1;
                }
                if pos > digit_start {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No binary digits after 0b - fall through to parse '0' as decimal
            } else if prefix_byte == b'o' || prefix_byte == b'O' {
                // Octal (explicit): 0o[0-7_]+
                pos += 2; // consume '0o'
                let digit_start = pos;
                while pos < bytes.len()
                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
                {
                    pos += 1;
                }
                if pos > digit_start {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No octal digits after 0o - fall through to parse '0' as decimal
            }
        }

        // Consume initial digits - unrolled for better performance
        pos = self.position;
        while pos < bytes.len() {
            let byte = Self::byte_at(bytes, pos);
            if byte.is_ascii_digit() || byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        self.position = pos;

        // Check for decimal point - optimized with single bounds check
        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
            // Peek ahead to see what follows the dot
            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();

            // Optimized dot consumption logic.
            // The dot joins the number when a digit follows (`1.5`), when it
            // ends the input (`1.`), or when whitespace, a terminator, an
            // operator, or an exponent marker follows (`1.;`, `1.e5`). A dot
            // followed by an identifier character is left unconsumed so it
            // can be re-lexed separately (e.g. `1.foo`).
            let should_consume_dot = has_following_digit || {
                pos + 1 >= bytes.len() || {
                    // Use bitwise operations for faster character classification
                    let next_byte = bytes[pos + 1];
                    // Whitespace, delimiters, operators - optimized check
                    next_byte <= b' '
                        || matches!(
                            next_byte,
                            b';' | b','
                                | b')'
                                | b'}'
                                | b']'
                                | b'+'
                                | b'-'
                                | b'*'
                                | b'/'
                                | b'%'
                                | b'='
                                | b'<'
                                | b'>'
                                | b'!'
                                | b'&'
                                | b'|'
                                | b'^'
                                | b'~'
                                | b'e'
                                | b'E'
                        )
                }
            };

            if should_consume_dot {
                pos += 1; // consume the dot
                // Consume fractional digits - batch processing
                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                self.position = pos;
            }
        }

        // Check for exponent - optimized
        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
            let exp_start = pos;
            pos += 1; // consume 'e' or 'E'

            // Check for optional sign
            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                pos += 1;
            }

            // Must have at least one digit after exponent (underscores allowed between digits)
            let mut saw_digit = false;
            while pos < bytes.len() {
                let byte = bytes[pos];
                if byte.is_ascii_digit() {
                    saw_digit = true;
                    pos += 1;
                } else if byte == b'_' {
                    pos += 1;
                } else {
                    break;
                }
            }

            // If no digits after exponent, backtrack to before the 'e'/'E'
            // (and any consumed sign) so it can be lexed as a separate token
            if !saw_digit {
                pos = exp_start;
            }

            self.position = pos;
        }

        // Avoid string slicing for common number cases - use Arc::from directly on slice
        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Number(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1269
1270 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1271 // We're at the dot, consume it
1272 self.advance();
1273
1274 // Parse the fractional part
1275 while self.position < self.input_bytes.len() {
1276 let byte = self.input_bytes[self.position];
1277 match byte {
1278 b'0'..=b'9' | b'_' => self.position += 1,
1279 b'e' | b'E' => {
1280 // Handle scientific notation
1281 self.advance();
1282 if self.position < self.input_bytes.len() {
1283 let next = self.input_bytes[self.position];
1284 if next == b'+' || next == b'-' {
1285 self.advance();
1286 }
1287 }
1288 // Parse exponent digits (underscores allowed between digits)
1289 let exponent_start = self.position;
1290 let mut saw_digit = false;
1291 while self.position < self.input_bytes.len() {
1292 let byte = self.input_bytes[self.position];
1293 if byte.is_ascii_digit() {
1294 saw_digit = true;
1295 self.position += 1;
1296 } else if byte == b'_' {
1297 self.position += 1;
1298 } else {
1299 break;
1300 }
1301 }
1302
1303 // No digits after exponent marker, rewind so caller treats `e` as separate token.
1304 if !saw_digit {
1305 self.position = exponent_start.saturating_sub(1);
1306 }
1307 break;
1308 }
1309 _ => break,
1310 }
1311 }
1312
1313 let text = &self.input[start..self.position];
1314 self.mode = LexerMode::ExpectOperator;
1315
1316 Some(Token {
1317 token_type: TokenType::Number(Arc::from(text)),
1318 text: Arc::from(text),
1319 start,
1320 end: self.position,
1321 })
1322 }
1323
    /// Attempt to lex a variable at the current position: `$scalar`,
    /// `@array`, `%hash`, or `*glob`.
    ///
    /// Also handles the special forms visible below: `$#array` (array
    /// length), `${^NAME}` (caret variables), `$::{name}` (stash access),
    /// `${name}` (braced names), punctuation variables (`$!`, `$@`, `@-`,
    /// ...), and package-qualified names (`$Foo::bar`).
    ///
    /// For dereference syntax like `@{$ref}` or postfix-deref `->@*`, only
    /// the sigil itself is emitted as a token; the braces/brackets are left
    /// for subsequent tokens so the parser can structure the dereference.
    /// Returns `None` when the current char is not a sigil, or when `%`/`*`
    /// should instead be treated as an operator (ExpectOperator mode).
    fn try_variable(&mut self) -> Option<Token> {
        let start = self.position;
        let sigil = self.current_char()?;

        match sigil {
            '$' | '@' | '%' | '*' => {
                // In ExpectOperator mode, treat % and * as operators rather than sigils
                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                    return None;
                }
                self.advance();

                // Special case: After ->, sigils followed by { or [ should be tokenized separately
                // This is for postfix dereference like ->@*, ->%{}, ->@[]
                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
                let check_arrow = self.position >= 3
                    && self.position.saturating_sub(1) <= self.input.len()
                    && self.input.is_char_boundary(self.position.saturating_sub(3))
                    && self.input.is_char_boundary(self.position.saturating_sub(1));

                if check_arrow
                    && {
                        // Temporarily step back over "<arrow><sigil>" to test
                        // whether the two bytes before the sigil are "->".
                        let saved = self.position;
                        self.position -= 3;
                        let arrow = self.matches_bytes(b"->");
                        self.position = saved;
                        arrow
                    }
                    && matches!(self.current_char(), Some('{' | '[' | '*'))
                {
                    // Just return the sigil
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for $# (array length operator)
                if sigil == '$' && self.current_char() == Some('#') {
                    self.advance(); // consume #
                    // Now parse the array name
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else if ch == ':' && self.peek_char(1) == Some(':') {
                            // Package-qualified array name
                            self.advance();
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
                if self.current_char() == Some('{') {
                    // Peek ahead to decide if we should consume the brace
                    let next_char = self.peek_char(1);

                    // Check if this is a dereference like @{$ref} or @{[...]}
                    // If the next char suggests dereference, don't consume the brace
                    if sigil != '*'
                        && matches!(
                            next_char,
                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r')
                        )
                    {
                        // This is a dereference, don't consume the brace
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    self.advance(); // consume {

                    // Handle special variables with caret
                    if self.current_char() == Some('^') {
                        self.advance(); // consume ^
                        // Parse the special variable name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Handle stash access like $::{foo}
                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance(); // consume first :
                        self.advance(); // consume second :
                        // Skip optional { and }
                        if self.current_char() == Some('{') {
                            self.advance();
                        }
                        // Parse the name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                if self.current_char() == Some('}') {
                                    self.advance(); // consume closing } of ${...}
                                }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Regular braced variable like ${foo} or glob like *{$glob}
                    else {
                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                        // EXCEPT for globs - *{$glob} should be parsed as one token
                        // Also check for empty braces or EOF - in these cases we should split the tokens
                        if sigil != '*'
                            && (matches!(
                                self.current_char(),
                                Some(
                                    '$' | '@'
                                        | '%'
                                        | '*'
                                        | '&'
                                        | '['
                                        | ' '
                                        | '\t'
                                        | '\n'
                                        | '\r'
                                        | '}'
                                )
                            ) || self.current_char().is_none())
                        {
                            // This is a dereference or empty/invalid brace, backtrack
                            self.position = start + 1; // Just past the sigil
                            let text = &self.input[start..self.position];
                            self.mode = LexerMode::ExpectOperator;

                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                text: Arc::from(text),
                                start,
                                end: self.position,
                            });
                        }

                        // For glob access, we need to consume everything inside braces
                        if sigil == '*' {
                            // Track nesting so *{...{...}...} consumes the
                            // whole braced expression as one token
                            let mut brace_depth: usize = 1;
                            while let Some(ch) = self.current_char() {
                                if ch == '{' {
                                    brace_depth += 1;
                                } else if ch == '}' {
                                    brace_depth = brace_depth.saturating_sub(1);
                                    if brace_depth == 0 {
                                        self.advance(); // consume final }
                                        break;
                                    }
                                }
                                self.advance();
                            }
                        } else {
                            // Regular variable
                            while let Some(ch) = self.current_char() {
                                if ch == '}' {
                                    self.advance(); // consume }
                                    break;
                                } else if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                }
                // Parse regular variable name
                else if let Some(ch) = self.current_char() {
                    if is_perl_identifier_start(ch) {
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                        // Handle package-qualified segments like Foo::bar
                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                            self.advance();
                            self.advance();
                            while let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                    // Handle special punctuation variables
                    else if sigil == '$'
                        && matches!(
                            ch,
                            '?' | '!'
                                | '@'
                                | '&'
                                | '`'
                                | '\''
                                | '.'
                                | '/'
                                | '\\'
                                | '|'
                                | '+'
                                | '-'
                                | '['
                                | ']'
                                | '$'
                        )
                    {
                        self.advance(); // consume the special character
                    }
                    // Handle special array/hash punctuation variables
                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                        self.advance(); // consume the + or -
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;

                Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                })
            }
            _ => None,
        }
    }
1588
1589 /// Return next non-space char without consuming.
1590 fn peek_nonspace(&self) -> Option<char> {
1591 let mut i = self.position;
1592 while i < self.input.len() {
1593 let c = self.input.get(i..).and_then(|s| s.chars().next())?;
1594 if c.is_whitespace() {
1595 i += c.len_utf8();
1596 continue;
1597 }
1598 return Some(c);
1599 }
1600 None
1601 }
1602
1603 /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1604 fn is_quote_delim(c: char) -> bool {
1605 // Quote delimiters are punctuation, but not whitespace or control characters
1606 !c.is_ascii_alphanumeric() && !c.is_whitespace() && !c.is_control()
1607 }
1608
    /// Attempt to lex a bareword at the current position, classifying it as
    /// a keyword, identifier, data-section marker, or the start of a
    /// quote-like / substitution / transliteration operator.
    ///
    /// Handles, in order: `s'`/`y'`/`tr'` (quote-delimited substitution and
    /// transliteration), plain identifiers with legacy `'` and `::` package
    /// separators, `__DATA__`/`__END__` markers (only at line start in the
    /// code channel), `s`/`tr`/`y` followed by a delimiter, and quote
    /// operators (`q`, `qq`, `qw`, `qx`, `qr`, `m`, ...) which hand off to
    /// `parse_quote_operator`. Mode-changing keywords (`if`, `while`, ...)
    /// set `ExpectTerm`; other identifiers set `ExpectOperator`.
    #[inline]
    fn try_identifier_or_keyword(&mut self) -> Option<Token> {
        let start = self.position;
        let ch = self.current_char()?;

        if is_perl_identifier_start(ch) {
            // Special case: substitution/transliteration with single-quote delimiter
            // The single quote is considered an identifier continuation, so we need to
            // detect these operators before consuming it as part of an identifier.
            if ch == 's' && self.peek_char(1) == Some('\'') {
                self.advance(); // consume 's'
                return self.parse_substitution(start);
            } else if ch == 'y' && self.peek_char(1) == Some('\'') {
                self.advance(); // consume 'y'
                return self.parse_transliteration(start);
            } else if ch == 't' && self.peek_char(1) == Some('r') && self.peek_char(2) == Some('\'')
            {
                self.advance(); // consume 't'
                self.advance(); // consume 'r'
                return self.parse_transliteration(start);
            }

            while let Some(ch) = self.current_char() {
                // Single quote is usually allowed inside Perl identifiers (legacy package separator),
                // but it can also be the delimiter for quote-like operators (q'..', qq'..', qr'..', m'..').
                // If we've already read one of those operator words, stop before consuming the quote
                // so the quote-operator path can handle it.
                if ch == '\''
                    && matches!(
                        &self.input[start..self.position],
                        "m" | "q" | "qq" | "qw" | "qx" | "qr"
                    )
                {
                    break;
                }

                if is_perl_identifier_continue(ch) {
                    self.advance();
                } else {
                    break;
                }
            }
            // Handle package-qualified identifiers like Foo::bar
            while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                // consume '::'
                self.advance();
                self.advance();

                // consume following identifier segment if present
                if let Some(ch) = self.current_char()
                    && is_perl_identifier_start(ch)
                {
                    self.advance();
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
            }

            let text = &self.input[start..self.position];

            // Check for __DATA__ and __END__ markers using exact match
            // Only recognize these in code channel, not inside data/format sections or heredocs
            let in_code_channel =
                !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
                    && self.pending_heredocs.is_empty();

            let marker = if in_code_channel {
                if text == "__DATA__" {
                    Some("__DATA__")
                } else if text == "__END__" {
                    Some("__END__")
                } else {
                    None
                }
            } else {
                None
            };

            if let Some(marker_text) = marker {
                // These must be at the beginning of a line
                // Use the after_newline flag to determine if we're at line start
                if self.after_newline {
                    // Check if rest of line is only whitespace
                    // Only treat as data marker if line has no trailing junk
                    if Self::trailing_ws_only(self.input_bytes, self.position) {
                        // Consume the rest of the line (the marker line)
                        while self.position < self.input.len()
                            && self.input_bytes[self.position] != b'\n'
                        {
                            self.advance();
                        }
                        if self.position < self.input.len()
                            && self.input_bytes[self.position] == b'\n'
                        {
                            self.advance();
                        }

                        // Switch to data section mode
                        self.mode = LexerMode::InDataSection;

                        return Some(Token {
                            token_type: TokenType::DataMarker(Arc::from(marker_text)),
                            text: Arc::from(marker_text),
                            start,
                            end: self.position,
                        });
                    }
                }
            }

            // Check for substitution/transliteration operators
            #[allow(clippy::collapsible_if)]
            if matches!(text, "s" | "tr" | "y") {
                if let Some(next) = self.current_char() {
                    // Check if followed by a delimiter
                    if matches!(
                        next,
                        '/' | '|'
                            | '\''
                            | '{'
                            | '['
                            | '('
                            | '<'
                            | '!'
                            | '#'
                            | '@'
                            | '$'
                            | '%'
                            | '^'
                            | '&'
                            | '*'
                            | '+'
                            | '='
                            | '~'
                            | '`'
                    ) {
                        match text {
                            "s" => {
                                return self.parse_substitution(start);
                            }
                            "tr" | "y" => {
                                return self.parse_transliteration(start);
                            }
                            // Defensive arm: `text` is already matched above,
                            // so this is unreachable in practice; keep a
                            // diagnostic token rather than panicking.
                            unexpected => {
                                // Return diagnostic token instead of panicking
                                return Some(Token {
                                    token_type: TokenType::Error(Arc::from(format!(
                                        "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
                                        unexpected, start
                                    ))),
                                    text: Arc::from(unexpected),
                                    start,
                                    end: self.position,
                                });
                            }
                        }
                    }
                }
            }

            let token_type = if is_keyword(text) {
                // Check for special keywords that affect lexer mode
                match text {
                    "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
                    | "sort" | "split" => {
                        self.mode = LexerMode::ExpectTerm;
                    }
                    "sub" => {
                        self.in_prototype = true;
                    }
                    // Quote operators expect a delimiter next (must be immediately adjacent)
                    op if quote_handler::is_quote_operator(op) => {
                        // For regex operators like 'm', 's', 'tr', 'y', delimiter must be immediately adjacent
                        // For quote operators like 'q', 'qq', 'qw', 'qr', 'qx', we allow whitespace
                        let next_char = if matches!(op, "m" | "s" | "tr" | "y") {
                            self.current_char() // Must be immediately adjacent
                        } else {
                            self.peek_nonspace() // Can skip whitespace
                        };

                        if let Some(next) = next_char {
                            if Self::is_quote_delim(next) {
                                self.mode = LexerMode::ExpectDelimiter;
                                self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
                                    operator: op.to_string(),
                                    delimiter: '\0', // Will be set when we see the delimiter
                                    start_pos: start,
                                });

                                // Don't return a keyword token - continue to parse the delimiter
                                // Skip any whitespace between operator and delimiter
                                while let Some(ch) = self.current_char() {
                                    if ch.is_whitespace() {
                                        self.advance();
                                    } else {
                                        break;
                                    }
                                }

                                // Get the delimiter
                                #[allow(clippy::collapsible_if)]
                                if let Some(delim) = self.current_char() {
                                    if !delim.is_alphanumeric() {
                                        self.advance();
                                        if let Some(ref mut info) = self.current_quote_op {
                                            info.delimiter = delim;
                                        }
                                        // Parse the quote operator content and return the complete token
                                        return self.parse_quote_operator(delim);
                                    }
                                }
                            } else {
                                // Not a quote operator here → treat as IDENTIFIER
                                self.current_quote_op = None;
                                self.mode = LexerMode::ExpectOperator;
                                return Some(Token {
                                    token_type: TokenType::Identifier(Arc::from(text)),
                                    start,
                                    end: self.position,
                                    text: Arc::from(text),
                                });
                            }
                        } else {
                            // End-of-input after the word → also treat as IDENTIFIER
                            self.current_quote_op = None;
                            self.mode = LexerMode::ExpectOperator;
                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                start,
                                end: self.position,
                                text: Arc::from(text),
                            });
                        }
                        // If we get here but haven't returned, something went wrong
                        // Fall through to treat as identifier
                        self.current_quote_op = None;
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            start,
                            end: self.position,
                            text: Arc::from(text),
                        });
                    }
                    // Format declarations need special handling
                    "format" => {
                        // We'll need to check for the = after the format name
                        // For now, just mark that we saw format
                    }
                    _ => {}
                }
                TokenType::Keyword(Arc::from(text))
            } else {
                self.mode = LexerMode::ExpectOperator;
                TokenType::Identifier(Arc::from(text))
            };

            Some(Token { token_type, text: Arc::from(text), start, end: self.position })
        } else {
            None
        }
    }
1876
1877 /// Parse data section body - consumes everything to EOF
1878 fn parse_data_body(&mut self) -> Option<Token> {
1879 if self.position >= self.input.len() {
1880 // Already at EOF
1881 self.mode = LexerMode::ExpectTerm;
1882 return Some(Token {
1883 token_type: TokenType::EOF,
1884 text: Arc::from(""),
1885 start: self.position,
1886 end: self.position,
1887 });
1888 }
1889
1890 let start = self.position;
1891 // Consume everything to EOF
1892 let body = &self.input[self.position..];
1893 self.position = self.input.len();
1894
1895 // Reset mode for next parse (though we're at EOF)
1896 self.mode = LexerMode::ExpectTerm;
1897
1898 Some(Token {
1899 token_type: TokenType::DataBody(Arc::from(body)),
1900 text: Arc::from(body),
1901 start,
1902 end: self.position,
1903 })
1904 }
1905
1906 /// Parse format body - consumes until a line with just a dot
1907 fn parse_format_body(&mut self) -> Option<Token> {
1908 let start = self.position;
1909 let mut body = String::new();
1910 let mut line_start = true;
1911
1912 while self.position < self.input.len() {
1913 // Check if we're at the start of a line and the next char is a dot
1914 if line_start && self.current_char() == Some('.') {
1915 // Check if this line contains only a dot
1916 let mut peek_pos = self.position + 1;
1917 let mut found_terminator = true;
1918
1919 // Skip any trailing whitespace on the dot line
1920 while peek_pos < self.input.len() {
1921 match self.input_bytes[peek_pos] {
1922 b' ' | b'\t' | b'\r' => peek_pos += 1,
1923 b'\n' => break,
1924 _ => {
1925 found_terminator = false;
1926 break;
1927 }
1928 }
1929 }
1930
1931 if found_terminator {
1932 // We found the terminating dot, consume it
1933 self.position = peek_pos;
1934 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
1935 {
1936 self.position += 1;
1937 }
1938
1939 // Switch back to normal mode
1940 self.mode = LexerMode::ExpectTerm;
1941
1942 return Some(Token {
1943 token_type: TokenType::FormatBody(Arc::from(body.clone())),
1944 text: Arc::from(body),
1945 start,
1946 end: self.position,
1947 });
1948 }
1949 }
1950
1951 // Not a terminator, consume the character
1952 match self.current_char() {
1953 Some(ch) => {
1954 body.push(ch);
1955 self.advance();
1956
1957 // Track if we're at the start of a line
1958 line_start = ch == '\n';
1959 }
1960 None => {
1961 // Reached EOF without finding terminator
1962 break;
1963 }
1964 }
1965 }
1966
1967 // If we reach here, we didn't find a terminator
1968 self.mode = LexerMode::ExpectTerm;
1969 Some(Token {
1970 token_type: TokenType::Error(Arc::from("Unterminated format body")),
1971 text: Arc::from(body),
1972 start,
1973 end: self.position,
1974 })
1975 }
1976
    /// Attempt to lex an operator at the current position.
    ///
    /// Performs the mode-based `/` disambiguation described in the block
    /// comment below (regex vs. division vs. `//`), handles `.`-leading
    /// decimals like `.5`, and greedily consumes two- and three-character
    /// compound operators (`**=`, `<<=`, `<=>`, `...`, ...). Most operators
    /// switch the mode to `ExpectTerm`; postfix `++`/`--` keep
    /// `ExpectOperator` since they complete a term. Returns `None` when the
    /// current char starts no operator, or when a quote-operator delimiter
    /// is pending.
    fn try_operator(&mut self) -> Option<Token> {
        // Skip operator parsing if we're expecting a delimiter for a quote operator
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            return None;
        }

        let start = self.position;
        let ch = self.current_char()?;

        // ═══════════════════════════════════════════════════════════════════════
        // SLASH DISAMBIGUATION STRATEGY (Issue #422)
        // ═══════════════════════════════════════════════════════════════════════
        //
        // Perl's `/` character is ambiguous:
        //   - Division operator: `$x / 2`
        //   - Regex delimiter: `/pattern/`
        //   - Defined-or operator: `$x // $y`
        //
        // **Disambiguation Strategy (Context-Aware Heuristics):**
        //
        // 1. **Mode-Based Decision (Primary)**:
        //    - `LexerMode::ExpectTerm` → `/` starts a regex
        //      Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
        //    - `LexerMode::ExpectOperator` → `/` is division or `//`
        //      Examples: `$x / 2`, `$x // $y`, `) / 3`
        //
        // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
        //    Mode is set based on previous token:
        //    - After identifier/number/closing paren → ExpectOperator → division
        //    - After operator/keyword/opening paren → ExpectTerm → regex
        //
        // 3. **Timeout Protection**:
        //    - Regex parsing has budget guard: MAX_REGEX_BYTES (64KB)
        //    - Budget exceeded → emit UnknownRest token (graceful degradation)
        //    - See `parse_regex()` and `budget_guard()` for implementation
        //
        // 4. **Performance Characteristics**:
        //    - Single-pass: O(1) decision based on mode flag
        //    - No backtracking: Mode updated after each token
        //    - Optimized: Byte-level operations for common cases
        //
        // **Metrics & Monitoring**:
        // - Budget exceeded events tracked via UnknownRest token emission
        // - LSP diagnostics generated for truncated regexes
        // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
        //
        // ═══════════════════════════════════════════════════════════════════════

        if ch == '/' {
            if self.mode == LexerMode::ExpectTerm {
                // Mode indicates we're expecting a term → `/` starts a regex
                // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
                return self.parse_regex(start);
            } else {
                // Mode indicates we're expecting an operator → `/` is division or `//`
                // Examples: `$x / 2`, `$x // $y`, `10 / 3`
                self.advance();
                // Check for // or //= using byte-level operations for speed
                if self.peek_byte(0) == Some(b'/') {
                    self.position += 1; // consume second / directly
                    if self.peek_byte(0) == Some(b'=') {
                        self.position += 1; // consume = directly
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    } else {
                        // Use cached string for common "//" operator
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from("//")),
                            text: Arc::from("//"),
                            start,
                            end: self.position,
                        });
                    }
                } else if self.position < self.input_bytes.len()
                    && self.input_bytes[self.position] == b'='
                {
                    // /= division-assign operator
                    self.position += 1; // consume =
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Operator(Arc::from("/=")),
                        text: Arc::from("/="),
                        start,
                        end: self.position,
                    });
                } else {
                    // Use cached string for common "/" division
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Division,
                        text: Arc::from("/"),
                        start,
                        end: self.position,
                    });
                }
            }
        }

        // Handle other operators - simplified
        match ch {
            '.' => {
                // Check if it's a decimal number like .5
                if self.peek_char(1).is_some_and(|c| c.is_ascii_digit()) {
                    return self.parse_decimal_number(start);
                }
                self.advance();
                // Check for compound operators
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            } else if ch == '.' && next == '.' && third == Some('.') {
                                self.advance(); // consume the third .
                            }
                        }
                    }
                }
            }
            '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
            | '\\' => {
                self.advance();
                // Check for compound operators
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            }
                        }
                    }
                }
            }
            _ => return None,
        }

        let text = &self.input[start..self.position];
        // Postfix ++ and -- complete a term expression, so next token is an operator
        // (e.g., "$x++ / 2" → / is division, not regex)
        if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
            // Postfix: stay in ExpectOperator
        } else {
            self.mode = LexerMode::ExpectTerm;
        }

        Some(Token {
            token_type: TokenType::Operator(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
2170
2171 fn try_delimiter(&mut self) -> Option<Token> {
2172 let start = self.position;
2173 let ch = self.current_char()?;
2174
2175 // If we're expecting a delimiter for a quote operator, handle it specially
2176 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2177 // Accept any non-alphanumeric character as a delimiter
2178 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2179 self.advance();
2180 if let Some(ref mut info) = self.current_quote_op {
2181 info.delimiter = ch;
2182 }
2183 // Now parse the quote operator content
2184 return self.parse_quote_operator(ch);
2185 }
2186 }
2187
2188 match ch {
2189 '(' => {
2190 // Check if this is a quote operator delimiter
2191 if matches!(self.mode, LexerMode::ExpectDelimiter)
2192 && self.current_quote_op.is_some()
2193 {
2194 self.advance();
2195 if let Some(ref mut info) = self.current_quote_op {
2196 info.delimiter = ch;
2197 }
2198 return self.parse_quote_operator(ch);
2199 }
2200
2201 self.advance();
2202 if self.in_prototype {
2203 self.prototype_depth += 1;
2204 }
2205 self.mode = LexerMode::ExpectTerm;
2206 Some(Token {
2207 token_type: TokenType::LeftParen,
2208 text: Arc::from("("),
2209 start,
2210 end: self.position,
2211 })
2212 }
2213 ')' => {
2214 self.advance();
2215 if self.in_prototype && self.prototype_depth > 0 {
2216 self.prototype_depth -= 1;
2217 if self.prototype_depth == 0 {
2218 self.in_prototype = false;
2219 }
2220 }
2221 self.mode = LexerMode::ExpectOperator;
2222 Some(Token {
2223 token_type: TokenType::RightParen,
2224 text: Arc::from(")"),
2225 start,
2226 end: self.position,
2227 })
2228 }
2229 ';' => {
2230 self.advance();
2231 self.mode = LexerMode::ExpectTerm;
2232 Some(Token {
2233 token_type: TokenType::Semicolon,
2234 text: Arc::from(";"),
2235 start,
2236 end: self.position,
2237 })
2238 }
2239 ',' => {
2240 self.advance();
2241 self.mode = LexerMode::ExpectTerm;
2242 Some(Token {
2243 token_type: TokenType::Comma,
2244 text: Arc::from(","),
2245 start,
2246 end: self.position,
2247 })
2248 }
2249 '[' => {
2250 self.advance();
2251 self.mode = LexerMode::ExpectTerm;
2252 Some(Token {
2253 token_type: TokenType::LeftBracket,
2254 text: Arc::from("["),
2255 start,
2256 end: self.position,
2257 })
2258 }
2259 ']' => {
2260 self.advance();
2261 self.mode = LexerMode::ExpectOperator;
2262 Some(Token {
2263 token_type: TokenType::RightBracket,
2264 text: Arc::from("]"),
2265 start,
2266 end: self.position,
2267 })
2268 }
2269 '{' => {
2270 self.advance();
2271 self.mode = LexerMode::ExpectTerm;
2272 Some(Token {
2273 token_type: TokenType::LeftBrace,
2274 text: Arc::from("{"),
2275 start,
2276 end: self.position,
2277 })
2278 }
2279 '}' => {
2280 self.advance();
2281 self.mode = LexerMode::ExpectOperator;
2282 Some(Token {
2283 token_type: TokenType::RightBrace,
2284 text: Arc::from("}"),
2285 start,
2286 end: self.position,
2287 })
2288 }
2289 '#' => {
2290 // Only treat as delimiter in ExpectDelimiter mode
2291 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2292 self.advance();
2293 // Reset mode after consuming delimiter
2294 self.mode = LexerMode::ExpectTerm;
2295 Some(Token {
2296 token_type: TokenType::Operator(Arc::from("#")),
2297 text: Arc::from("#"),
2298 start,
2299 end: self.position,
2300 })
2301 } else {
2302 None
2303 }
2304 }
2305 _ => None,
2306 }
2307 }
2308
    /// Parse a double-quoted string literal starting at the opening `"`.
    ///
    /// Produces `StringLiteral` when no interpolation occurred, or
    /// `InterpolatedString(parts)` with alternating literal/variable parts
    /// when `$name` interpolation was found (and enabled via
    /// `config.parse_interpolation`). Escape sequences are kept verbatim
    /// (backslash + following char). On a missing closing quote the rest of
    /// the input is consumed and an `Error` token is returned.
    ///
    /// NOTE(review): `${name}`-style interpolation is not handled here, and a
    /// bare `$` with no identifier after it is dropped from `parts` (though
    /// the token `text` still contains the full source slice) — confirm
    /// whether that is intended.
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                '"' => {
                    // Closing quote: flush any pending literal text and emit.
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    // A string completes a term, so `/` after it is division.
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Keep escapes verbatim: push '\' and the escaped char.
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    // Parse variable - optimized using byte-level checks where possible
                    self.advance();
                    let var_start = self.position;

                    // Fast path for ASCII identifier continuation
                    while self.position < self.input_bytes.len() {
                        let byte = self.input_bytes[self.position];
                        if byte.is_ascii_alphanumeric() || byte == b'_' {
                            self.position += 1;
                        } else if byte >= 128 {
                            // Only use UTF-8 parsing for non-ASCII
                            if let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }

                    if self.position > var_start {
                        // var_start - 1 re-includes the '$' sigil in the name.
                        let var_name = &self.input[var_start - 1..self.position];
                        parts.push(StringPart::Variable(Arc::from(var_name)));
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        // Unterminated string - return error token consuming rest of input
        let end = self.input.len();
        self.position = end;

        Some(Token {
            token_type: TokenType::Error(Arc::from("unterminated string")),
            text: Arc::from(&self.input[start..end]),
            start,
            end,
        })
    }
2414
2415 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2416 self.advance(); // Skip opening quote
2417
2418 let mut last_pos = self.position;
2419
2420 while let Some(ch) = self.current_char() {
2421 match ch {
2422 '\'' => {
2423 self.advance();
2424 let text = &self.input[start..self.position];
2425 self.mode = LexerMode::ExpectOperator;
2426
2427 return Some(Token {
2428 token_type: TokenType::StringLiteral,
2429 text: Arc::from(text),
2430 start,
2431 end: self.position,
2432 });
2433 }
2434 '\\' => {
2435 self.advance();
2436 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2437 self.advance();
2438 }
2439 }
2440 _ => self.advance(),
2441 }
2442
2443 // Safety check: ensure we're making progress
2444 if self.position == last_pos {
2445 break;
2446 }
2447 last_pos = self.position;
2448 }
2449
2450 // Unterminated string - return error token consuming rest of input
2451 let end = self.input.len();
2452 self.position = end;
2453
2454 Some(Token {
2455 token_type: TokenType::Error(Arc::from("unterminated string")),
2456 text: Arc::from(&self.input[start..end]),
2457 start,
2458 end,
2459 })
2460 }
2461
2462 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2463 self.advance(); // Skip opening backtick
2464
2465 let mut last_pos = self.position;
2466
2467 while let Some(ch) = self.current_char() {
2468 match ch {
2469 '`' => {
2470 self.advance();
2471 let text = &self.input[start..self.position];
2472 self.mode = LexerMode::ExpectOperator;
2473
2474 return Some(Token {
2475 token_type: TokenType::QuoteCommand,
2476 text: Arc::from(text),
2477 start,
2478 end: self.position,
2479 });
2480 }
2481 '\\' => {
2482 self.advance();
2483 if self.current_char().is_some() {
2484 self.advance();
2485 }
2486 }
2487 _ => self.advance(),
2488 }
2489
2490 // Safety check: ensure we're making progress
2491 if self.position == last_pos {
2492 break;
2493 }
2494 last_pos = self.position;
2495 }
2496
2497 // Unterminated string - return error token consuming rest of input
2498 let end = self.input.len();
2499 self.position = end;
2500
2501 Some(Token {
2502 token_type: TokenType::Error(Arc::from("unterminated string")),
2503 text: Arc::from(&self.input[start..end]),
2504 start,
2505 end,
2506 })
2507 }
2508
    /// Placeholder for a dedicated q-string parser.
    ///
    /// Always returns `None`; q/qq/qw/qx appear to be handled through the
    /// quote-operator path (`parse_quote_operator`) instead — TODO confirm
    /// and either implement or remove this stub.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
2513
2514 /// Returns the closing delimiter for paired delimiters, or the same character for non-paired.
2515 /// This helper makes delimiter pairing explicit and avoids unreachable code paths.
2516 fn paired_closing(delim: char) -> char {
2517 match delim {
2518 '{' => '}',
2519 '[' => ']',
2520 '(' => ')',
2521 '<' => '>',
2522 _ => delim, // non-paired delimiters use the same character
2523 }
2524 }
2525
    /// Parse a substitution literal (`s/pat/repl/mods`) after the `s` has
    /// been consumed; `start` is the offset of the `s` itself.
    ///
    /// Paired delimiters (`{[(<`) nest and the replacement may use a
    /// DIFFERENT delimiter than the pattern (e.g. `s[foo]{bar}`, MUT_002
    /// fix). A budget guard bounds work on pathological patterns. Trailing
    /// alphanumerics are consumed as modifiers and validated by the parser
    /// (MUT_005 fix). Emits a single `Substitution` token covering the whole
    /// construct.
    fn parse_substitution(&mut self, start: usize) -> Option<Token> {
        // We've already consumed 's'
        let delimiter = self.current_char()?;
        self.advance(); // Skip delimiter

        // Parse pattern
        let mut depth = 1;
        let is_paired = matches!(delimiter, '{' | '[' | '(' | '<');
        let closing = Self::paired_closing(delimiter);

        while let Some(ch) = self.current_char() {
            // Check budget
            if let Some(token) = self.budget_guard(start, depth) {
                return Some(token);
            }

            match ch {
                '\\' => {
                    // Escape: skip backslash plus the escaped character
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                _ if ch == delimiter && is_paired => {
                    // Nested opener inside a paired pattern
                    depth += 1;
                    self.advance();
                }
                _ if ch == closing => {
                    self.advance();
                    if is_paired {
                        depth = depth.saturating_sub(1);
                        if depth == 0 {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                _ => self.advance(),
            }
        }

        // Parse replacement - may use different delimiter for paired patterns (e.g., s[foo]{bar})
        // MUT_002 fix: Detect the actual replacement delimiter instead of assuming same as pattern
        // Note: Pattern scanning is complete at this point; we use a separate repl_depth for replacement
        let (repl_delimiter, repl_closing, repl_is_paired) = if is_paired {
            // Skip whitespace between pattern and replacement for paired delimiters
            while let Some(ch) = self.current_char() {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Detect replacement delimiter - may be different from pattern delimiter
            if let Some(repl_delim) = self.current_char() {
                if matches!(repl_delim, '{' | '[' | '(' | '<') {
                    let repl_close = Self::paired_closing(repl_delim);
                    self.advance();
                    (repl_delim, repl_close, true)
                } else {
                    // Non-paired replacement after paired pattern (unusual but valid)
                    self.advance();
                    (repl_delim, repl_delim, false)
                }
            } else {
                // End of input - return what we have
                (delimiter, closing, is_paired)
            }
        } else {
            // Non-paired delimiter - replacement uses same delimiter
            (delimiter, closing, false)
        };

        // Use separate depth counter for replacement to avoid confusion with pattern depth
        let mut repl_depth: usize = 1;
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                _ if ch == repl_delimiter && repl_is_paired => {
                    repl_depth += 1;
                    self.advance();
                }
                _ if ch == repl_closing => {
                    self.advance();
                    if repl_is_paired {
                        repl_depth = repl_depth.saturating_sub(1);
                        if repl_depth == 0 {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                _ => self.advance(),
            }
        }

        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
        while let Some(ch) = self.current_char() {
            if ch.is_ascii_alphanumeric() {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        // The substitution completes a term; next token is an operator.
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Substitution,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
2649
2650 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
2651 // We've already consumed 'tr' or 'y'
2652 let delimiter = self.current_char()?;
2653 self.advance(); // Skip delimiter
2654
2655 // Parse search list
2656 let mut depth = 1;
2657 let is_paired = matches!(delimiter, '{' | '[' | '(' | '<');
2658 let closing = Self::paired_closing(delimiter);
2659
2660 while let Some(ch) = self.current_char() {
2661 // Check budget
2662 if let Some(token) = self.budget_guard(start, depth) {
2663 return Some(token);
2664 }
2665
2666 match ch {
2667 '\\' => {
2668 self.advance();
2669 if self.current_char().is_some() {
2670 self.advance();
2671 }
2672 }
2673 _ if ch == delimiter && is_paired => {
2674 depth += 1;
2675 self.advance();
2676 }
2677 _ if ch == closing => {
2678 self.advance();
2679 if is_paired {
2680 depth = depth.saturating_sub(1);
2681 if depth == 0 {
2682 break;
2683 }
2684 } else {
2685 break;
2686 }
2687 }
2688 _ => self.advance(),
2689 }
2690 }
2691
2692 // Parse replacement list - same delimiter handling
2693 if is_paired {
2694 // Skip whitespace between search and replace for paired delimiters
2695 while let Some(ch) = self.current_char() {
2696 if ch.is_whitespace() {
2697 self.advance();
2698 } else {
2699 break;
2700 }
2701 }
2702
2703 // Expect opening delimiter for replacement
2704 if self.current_char() == Some(delimiter) {
2705 self.advance();
2706 depth = 1;
2707 }
2708 }
2709
2710 while let Some(ch) = self.current_char() {
2711 match ch {
2712 '\\' => {
2713 self.advance();
2714 if self.current_char().is_some() {
2715 self.advance();
2716 }
2717 }
2718 _ if ch == delimiter && is_paired => {
2719 depth += 1;
2720 self.advance();
2721 }
2722 _ if ch == closing => {
2723 self.advance();
2724 if is_paired {
2725 depth = depth.saturating_sub(1);
2726 if depth == 0 {
2727 break;
2728 }
2729 } else {
2730 break;
2731 }
2732 }
2733 _ => self.advance(),
2734 }
2735 }
2736
2737 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2738 while let Some(ch) = self.current_char() {
2739 if ch.is_ascii_alphanumeric() {
2740 self.advance();
2741 } else {
2742 break;
2743 }
2744 }
2745
2746 let text = &self.input[start..self.position];
2747 self.mode = LexerMode::ExpectOperator;
2748
2749 Some(Token {
2750 token_type: TokenType::Transliteration,
2751 text: Arc::from(text),
2752 start,
2753 end: self.position,
2754 })
2755 }
2756
2757 /// Read content between delimiters
2758 fn read_delimited_body(&mut self, delim: char) -> String {
2759 let paired = quote_handler::paired_close(delim);
2760 let close = paired.unwrap_or(delim);
2761 let mut body = String::new();
2762 let mut depth = i32::from(paired.is_some());
2763
2764 while let Some(ch) = self.current_char() {
2765 if ch == '\\' {
2766 body.push(ch);
2767 self.advance();
2768 if let Some(next) = self.current_char() {
2769 body.push(next);
2770 self.advance();
2771 }
2772 continue;
2773 }
2774
2775 if paired.is_some() && ch == delim {
2776 body.push(ch);
2777 self.advance();
2778 depth += 1;
2779 continue;
2780 }
2781
2782 if ch == close {
2783 if paired.is_some() {
2784 depth -= 1;
2785 if depth == 0 {
2786 self.advance();
2787 break;
2788 }
2789 body.push(ch);
2790 self.advance();
2791 } else {
2792 self.advance();
2793 break;
2794 }
2795 continue;
2796 }
2797
2798 body.push(ch);
2799 self.advance();
2800 }
2801
2802 body
2803 }
2804
2805 /// Parse a quote operator after we've seen the delimiter
2806 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
2807 let info = self.current_quote_op.as_ref()?;
2808 let start = info.start_pos;
2809 let operator = info.operator.clone();
2810
2811 // Parse based on operator type
2812 match operator.as_str() {
2813 "s" => {
2814 // Substitution: two bodies
2815 let _pattern = self.read_delimited_body(delimiter);
2816
2817 // For paired delimiters, skip whitespace between bodies
2818 if quote_handler::paired_close(delimiter).is_some() {
2819 while let Some(ch) = self.current_char() {
2820 if ch.is_whitespace() {
2821 self.advance();
2822 } else {
2823 break;
2824 }
2825 }
2826 // Expect same delimiter for replacement
2827 if self.current_char() == Some(delimiter) {
2828 self.advance();
2829 }
2830 }
2831
2832 let _replacement = self.read_delimited_body(delimiter);
2833
2834 // Parse modifiers
2835 self.parse_regex_modifiers("e_handler::S_SPEC);
2836 }
2837 "tr" | "y" => {
2838 // Transliteration: two bodies
2839 let _from = self.read_delimited_body(delimiter);
2840
2841 // For paired delimiters, skip whitespace between bodies
2842 if quote_handler::paired_close(delimiter).is_some() {
2843 while let Some(ch) = self.current_char() {
2844 if ch.is_whitespace() {
2845 self.advance();
2846 } else {
2847 break;
2848 }
2849 }
2850 // Expect same delimiter for replacement
2851 if self.current_char() == Some(delimiter) {
2852 self.advance();
2853 }
2854 }
2855
2856 let _to = self.read_delimited_body(delimiter);
2857
2858 // Parse modifiers
2859 self.parse_regex_modifiers("e_handler::TR_SPEC);
2860 }
2861 "qr" => {
2862 let _pattern = self.read_delimited_body(delimiter);
2863 self.parse_regex_modifiers("e_handler::QR_SPEC);
2864 }
2865 "m" => {
2866 let _pattern = self.read_delimited_body(delimiter);
2867 self.parse_regex_modifiers("e_handler::M_SPEC);
2868 }
2869 _ => {
2870 // q, qq, qw, qx - no modifiers
2871 let _body = self.read_delimited_body(delimiter);
2872 }
2873 }
2874
2875 let text = &self.input[start..self.position];
2876 let token_type = quote_handler::get_quote_token_type(&operator);
2877
2878 self.mode = LexerMode::ExpectOperator;
2879 self.current_quote_op = None;
2880
2881 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2882 }
2883
2884 /// Parse regex modifiers according to the given spec
2885 ///
2886 /// This function includes ALL characters that could be intended as modifiers,
2887 /// including invalid ones. This allows the parser to properly reject invalid
2888 /// modifiers with a clear error message, rather than leaving them as separate
2889 /// tokens that could be confusingly parsed.
2890 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
2891 // Consume all alphanumeric characters that could be intended as modifiers
2892 // The parser will validate and reject invalid ones
2893 while let Some(ch) = self.current_char() {
2894 if ch.is_ascii_alphanumeric() {
2895 self.advance();
2896 } else {
2897 break;
2898 }
2899 }
2900 // Note: We no longer validate here - the parser will validate and provide
2901 // clear error messages for invalid modifiers (MUT_005 fix)
2902 }
2903
2904 /// Parse a regex literal starting with `/`
2905 ///
2906 /// **Timeout Protection (Issue #422)**:
2907 /// - Budget guard prevents infinite loops on pathological input
2908 /// - MAX_REGEX_BYTES limit (64KB) ensures bounded execution time
2909 /// - Graceful degradation: emit UnknownRest token if budget exceeded
2910 ///
2911 /// **Performance**:
2912 /// - Single-pass scanning with escape handling
2913 /// - Budget check per iteration (amortized O(1) via inline fast path)
2914 /// - Typical regex: <10μs, Large regex (64KB): ~1ms
2915 fn parse_regex(&mut self, start: usize) -> Option<Token> {
2916 self.advance(); // Skip opening /
2917
2918 while let Some(ch) = self.current_char() {
2919 // Budget guard: prevent timeout on pathological input (Issue #422)
2920 // If exceeded, returns UnknownRest token for graceful degradation
2921 if let Some(token) = self.budget_guard(start, 0) {
2922 return Some(token);
2923 }
2924
2925 match ch {
2926 '/' => {
2927 self.advance();
2928 // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
2929 while let Some(ch) = self.current_char() {
2930 if ch.is_ascii_alphanumeric() {
2931 self.advance();
2932 } else {
2933 break;
2934 }
2935 }
2936
2937 let text = &self.input[start..self.position];
2938 self.mode = LexerMode::ExpectOperator;
2939
2940 return Some(Token {
2941 token_type: TokenType::RegexMatch,
2942 text: Arc::from(text),
2943 start,
2944 end: self.position,
2945 });
2946 }
2947 '\\' => {
2948 // Handle escape sequences: consume backslash + next char
2949 self.advance();
2950 if self.current_char().is_some() {
2951 self.advance();
2952 }
2953 }
2954 _ => self.advance(),
2955 }
2956 }
2957
2958 // Unterminated regex - EOF reached before closing /
2959 // Parser will emit diagnostic for unterminated literal
2960 None
2961 }
2962}
2963
// Process-wide empty Arc<str>, created once; callers get cheap refcount clones.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return the shared empty `Arc<str>` without allocating a new buffer.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
2971
2972#[inline(always)]
2973fn is_keyword(word: &str) -> bool {
2974 // Fast length-based rejection for most cases.
2975 // Lexer keywords are currently bounded to 1..=9 characters.
2976 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
2977}
2978
2979/// Fast lookup table for compound operator second characters
2980const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*";
2981
2982#[inline]
2983fn is_compound_operator(first: char, second: char) -> bool {
2984 // Optimized compound operator lookup using perfect hashing for common cases
2985 // Convert to bytes for faster comparison (most operators are ASCII)
2986 if first.is_ascii() && second.is_ascii() {
2987 let first_byte = first as u8;
2988 let second_byte = second as u8;
2989
2990 if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
2991 return false;
2992 }
2993
2994 // Use lookup table approach for maximum performance
2995 match (first_byte, second_byte) {
2996 // Assignment operators
2997 (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,
2998
2999 // Comparison operators
3000 (b'<' | b'>' | b'=' | b'!', b'=') => true,
3001
3002 // Pattern operators
3003 (b'=' | b'!', b'~') => true,
3004
3005 // Increment/decrement
3006 (b'+', b'+') | (b'-', b'-') => true,
3007
3008 // Logical operators
3009 (b'&', b'&') | (b'|', b'|') => true,
3010
3011 // Shift operators
3012 (b'<', b'<') | (b'>', b'>') => true,
3013
3014 // Other compound operators
3015 (b'*', b'*')
3016 | (b'/', b'/')
3017 | (b'-' | b'=', b'>')
3018 | (b'.', b'.')
3019 | (b'~', b'~')
3020 | (b':', b':') => true,
3021
3022 _ => false,
3023 }
3024 } else {
3025 // Fallback for non-ASCII (should be rare)
3026 matches!(
3027 (first, second),
3028 ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
3029 | ('=' | '!' | '~', '~')
3030 | ('+', '+')
3031 | ('-', '-' | '>')
3032 | ('&', '&')
3033 | ('|', '|')
3034 | ('<', '<')
3035 | ('>' | '=', '>')
3036 | ('*', '*')
3037 | ('/', '/')
3038 | ('.', '.')
3039 | (':', ':')
3040 )
3041 }
3042}
3043
// Checkpoint support for incremental parsing
impl Checkpointable for PerlLexer<'_> {
    /// Capture the lexer's resumable state: byte position, mode, delimiter
    /// stack, prototype tracking, and line/column position, plus a context
    /// tag describing any in-progress construct (format body or quote-like).
    fn checkpoint(&self) -> LexerCheckpoint {
        use checkpoint::CheckpointContext;

        // Determine the checkpoint context based on current state
        let context = if matches!(self.mode, LexerMode::InFormatBody) {
            CheckpointContext::Format {
                // NOTE(review): start_position is only an approximation
                // (current position minus 100); the true format start is not
                // tracked here — confirm downstream consumers tolerate this.
                start_position: self.position.saturating_sub(100), // Approximate
            }
        } else if !self.delimiter_stack.is_empty() {
            // We're in some kind of quote-like construct
            CheckpointContext::QuoteLike {
                operator: String::new(), // Would need to track this
                delimiter: self.delimiter_stack.last().copied().unwrap_or('\0'),
                // NOTE(review): is_paired is hard-coded true regardless of
                // the actual delimiter — verify this is safe for restores.
                is_paired: true,
            }
        } else {
            CheckpointContext::Normal
        };

        LexerCheckpoint {
            position: self.position,
            mode: self.mode,
            delimiter_stack: self.delimiter_stack.clone(),
            in_prototype: self.in_prototype,
            prototype_depth: self.prototype_depth,
            current_pos: self.current_pos,
            context,
        }
    }

    /// Restore lexer state from a checkpoint, overwriting position, mode,
    /// delimiter stack, prototype tracking, and line/column position.
    fn restore(&mut self, checkpoint: &LexerCheckpoint) {
        self.position = checkpoint.position;
        self.mode = checkpoint.mode;
        // clone_from reuses the existing stack's allocation where possible.
        self.delimiter_stack.clone_from(&checkpoint.delimiter_stack);
        self.in_prototype = checkpoint.in_prototype;
        self.prototype_depth = checkpoint.prototype_depth;
        self.current_pos = checkpoint.current_pos;

        // Handle special contexts
        use checkpoint::CheckpointContext;
        if let CheckpointContext::Format { .. } = &checkpoint.context {
            // Ensure we're in format body mode
            if !matches!(self.mode, LexerMode::InFormatBody) {
                self.mode = LexerMode::InFormatBody;
            }
        }
    }

    /// A checkpoint is restorable as long as its position fits this input;
    /// no deeper consistency checks are performed.
    fn can_restore(&self, checkpoint: &LexerCheckpoint) -> bool {
        // Can restore if the position is valid for our input
        checkpoint.position <= self.input.len()
    }
}
3099
3100#[cfg(test)]
3101mod test_format_debug;
3102
#[cfg(test)]
mod tests {
    use super::*;

    // Boxed-error alias so tests can use `?` on Option-via-ok_or lookups.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    /// Smoke test: `my $x = 42;` lexes into keyword, identifier, operator,
    /// number, and semicolon in order.
    #[test]
    fn test_basic_tokens() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 42;");

        let token = lexer.next_token().ok_or("Expected keyword token")?;
        assert_eq!(token.token_type, TokenType::Keyword(Arc::from("my")));

        let token = lexer.next_token().ok_or("Expected identifier token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(_)));

        let token = lexer.next_token().ok_or("Expected operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(_)));

        let token = lexer.next_token().ok_or("Expected number token")?;
        assert!(matches!(token.token_type, TokenType::Number(_)));

        let token = lexer.next_token().ok_or("Expected semicolon token")?;
        assert_eq!(token.token_type, TokenType::Semicolon);
        Ok(())
    }

    /// `/` after a term is division; `/` where a term is expected is a regex.
    #[test]
    fn test_slash_disambiguation() -> TestResult {
        // Division
        let mut lexer = PerlLexer::new("10 / 2");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);

        // Regex
        let mut lexer = PerlLexer::new("if (/pattern/)");
        lexer.next_token(); // if
        lexer.next_token(); // (
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }

    /// `%` is a hash sigil where a term is expected, modulo after a term.
    #[test]
    fn test_percent_and_double_sigil_disambiguation() -> TestResult {
        // Hash variable
        let mut lexer = PerlLexer::new("%hash");
        let token = lexer.next_token().ok_or("Expected hash identifier token")?;
        assert!(
            matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "%hash")
        );

        // Modulo operator
        let mut lexer = PerlLexer::new("10 % 3");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected modulo operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "%"));
        Ok(())
    }

    /// `//` after a term is defined-or; after `=~` it is an (empty) regex.
    /// Also checks the `**` exponent compound operator.
    #[test]
    fn test_defined_or_and_exponent() -> TestResult {
        // Defined-or operator
        let mut lexer = PerlLexer::new("$a // $b");
        lexer.next_token(); // $a
        let token = lexer.next_token().ok_or("Expected defined-or operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "//"));

        // Regex after =~ should still parse
        let mut lexer = PerlLexer::new("$x =~ //");
        lexer.next_token(); // $x
        lexer.next_token(); // =~
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);

        // Exponent operator
        let mut lexer = PerlLexer::new("2 ** 3");
        lexer.next_token(); // 2
        let token = lexer.next_token().ok_or("Expected exponent operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "**"));
        Ok(())
    }
}