// perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![warn(clippy::all)]
117#![allow(
118 // Core allows for lexer code
119 clippy::too_many_lines,
120 clippy::module_name_repetitions,
121 clippy::cast_possible_truncation,
122 clippy::cast_sign_loss,
123 clippy::cast_possible_wrap,
124 clippy::cast_precision_loss,
125 clippy::must_use_candidate,
126 clippy::missing_errors_doc,
127 clippy::missing_panics_doc,
128
129 // Lexer-specific patterns that are fine
130 clippy::match_same_arms,
131 clippy::redundant_else,
132 clippy::unnecessary_wraps,
133 clippy::unused_self,
134 clippy::items_after_statements,
135 clippy::struct_excessive_bools,
136 clippy::uninlined_format_args
137)]
138
139use std::sync::{Arc, OnceLock};
140
141pub mod api;
142pub mod builtins;
143pub mod checkpoint;
144pub mod config;
145pub mod error;
146mod heredoc;
147pub mod keywords;
148mod lexer;
149pub mod limits;
150pub mod mode;
151mod quote_handler;
152pub mod token;
153pub mod tokenizer;
154mod unicode;
155
156pub use api::*;
157pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
158pub use config::LexerConfig;
159pub use error::{LexerError, Result};
160pub use lexer::PerlLexer;
161pub use limits::MAX_REGEX_PARSE_STEPS;
162pub use mode::LexerMode;
163pub use perl_position_tracking::Position;
164pub use token::{StringPart, Token, TokenType};
165
166use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
167
168use crate::heredoc::HeredocSpec;
169use crate::limits::{
170 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
171};
172
173impl<'a> PerlLexer<'a> {
174 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
175 pub fn with_body_tokens(input: &'a str) -> Self {
176 let mut lexer = Self::new(input);
177 lexer.emit_heredoc_body_tokens = true;
178 lexer
179 }
180
    /// Set the lexer mode (for resetting state at statement boundaries)
    pub fn set_mode(&mut self, mode: LexerMode) {
        // Overrides the current term/operator expectation; the parser uses
        // this to resynchronize context at statement boundaries.
        self.mode = mode;
    }
185
    /// Advance the lexer and return the next token.
    ///
    /// Returns `None` only after an `EOF` token has already been emitted.
    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
    ///
    /// Internally this is a state machine: format/data-section modes are
    /// dispatched first, then any pending heredoc body is consumed line by
    /// line, and only then is ordinary tokenization attempted via the
    /// `try_*` helpers (whose call order encodes lexing priority).
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            // (body_start == 0 is the sentinel for "not inside a body")
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // Allow any leading spaces/tabs before the label
                            // (the <<~LABEL indented-heredoc form)
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any)
                            // so stacked heredocs (<<A . <<B) chain correctly
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first.
            // NOTE: the order of these try_* calls encodes lexing priority
            // (heredocs before strings before variables, etc.) — do not reorder.
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_vstring() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Unicode path without intermediate heap allocation
                let mut buf = [0_u8; 4];
                Arc::from(ch.encode_utf8(&mut buf))
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
450
451 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
452 ///
453 /// **Purpose**: Protect against pathological input that could cause:
454 /// - Infinite loops in regex/heredoc parsing
455 /// - Excessive memory consumption
456 /// - LSP server hangs
457 ///
458 /// **Limits**:
459 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
460 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
461 ///
462 /// **Graceful Degradation**:
463 /// - Budget exceeded → emit `UnknownRest` token
464 /// - Jump to EOF to prevent further parsing of problematic region
465 /// - LSP client can emit soft diagnostic about truncation
466 /// - All previously parsed symbols remain valid
467 ///
468 /// **Performance**:
469 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
470 /// - Slow path: Only triggered on pathological input
471 /// - Amortized cost: O(1) per token
472 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
473 #[inline(always)]
474 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
475 // Fast path: most calls won't hit limits
476 let bytes_consumed = self.position - start;
477 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
478 return None;
479 }
480
481 // Slow path: budget exceeded - graceful degradation
482 #[cfg(debug_assertions)]
483 {
484 tracing::debug!(
485 bytes_consumed,
486 depth,
487 position = self.position,
488 "Lexer budget exceeded"
489 );
490 }
491
492 self.position = self.input.len();
493 Some(Token {
494 token_type: TokenType::UnknownRest,
495 text: Arc::from(""),
496 start,
497 end: self.position,
498 })
499 }
500
501 /// Peek at the next token without consuming it.
502 ///
503 /// Saves and restores the full lexer state so the next call to
504 /// [`next_token`](Self::next_token) returns the same token.
505 pub fn peek_token(&mut self) -> Option<Token> {
506 let saved_pos = self.position;
507 let saved_mode = self.mode;
508 let saved_delimiter_stack = self.delimiter_stack.clone();
509 let saved_prototype = self.in_prototype;
510 let saved_depth = self.prototype_depth;
511 let saved_after_sub = self.after_sub;
512 let saved_after_arrow = self.after_arrow;
513 let saved_hash_brace_depth = self.hash_brace_depth;
514 let saved_after_var_subscript = self.after_var_subscript;
515 let saved_paren_depth = self.paren_depth;
516 let saved_current_pos = self.current_pos;
517 let saved_after_newline = self.after_newline;
518 let saved_pending_heredocs = self.pending_heredocs.clone();
519 let saved_line_start_offset = self.line_start_offset;
520 let saved_current_quote_op = self.current_quote_op.clone();
521 let saved_eof_emitted = self.eof_emitted;
522 let saved_start_time = self.start_time;
523
524 let token = self.next_token();
525
526 self.position = saved_pos;
527 self.mode = saved_mode;
528 self.delimiter_stack = saved_delimiter_stack;
529 self.in_prototype = saved_prototype;
530 self.prototype_depth = saved_depth;
531 self.after_sub = saved_after_sub;
532 self.after_arrow = saved_after_arrow;
533 self.hash_brace_depth = saved_hash_brace_depth;
534 self.after_var_subscript = saved_after_var_subscript;
535 self.paren_depth = saved_paren_depth;
536 self.current_pos = saved_current_pos;
537 self.after_newline = saved_after_newline;
538 self.pending_heredocs = saved_pending_heredocs;
539 self.line_start_offset = saved_line_start_offset;
540 self.current_quote_op = saved_current_quote_op;
541 self.eof_emitted = saved_eof_emitted;
542 self.start_time = saved_start_time;
543
544 token
545 }
546
547 /// Consume all remaining tokens and return them as a vector.
548 ///
549 /// The returned vector always ends with an `EOF` token.
550 pub fn collect_tokens(&mut self) -> Vec<Token> {
551 let mut tokens = Vec::new();
552 while let Some(token) = self.next_token() {
553 if token.token_type == TokenType::EOF {
554 tokens.push(token);
555 break;
556 }
557 tokens.push(token);
558 }
559 tokens
560 }
561
    /// Reset the lexer to the beginning of the input.
    ///
    /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
    /// so the lexer can re-tokenize the same source from scratch.
    ///
    /// NOTE(review): any new mutable field added to the lexer should be
    /// reset here as well, and saved/restored in `peek_token`.
    pub fn reset(&mut self) {
        // Cursor and expectation mode.
        self.position = 0;
        self.mode = LexerMode::ExpectTerm;
        // Quote-operator / prototype machinery.
        self.delimiter_stack.clear();
        self.in_prototype = false;
        self.prototype_depth = 0;
        // Context flags used for operator disambiguation.
        self.after_sub = false;
        self.after_arrow = false;
        self.hash_brace_depth = 0;
        self.after_var_subscript = false;
        self.paren_depth = 0;
        // Line/column tracking; start of input counts as a line start.
        self.current_pos = Position::start();
        self.after_newline = true;
        // Heredoc and quote-op state.
        self.pending_heredocs.clear();
        self.line_start_offset = 0;
        self.current_quote_op = None;
        self.eof_emitted = false;
        // Restart the clock used for the heredoc timeout guard.
        self.start_time = std::time::Instant::now();
    }
585
    /// Switch the lexer into format-body parsing mode.
    ///
    /// In this mode the lexer consumes input verbatim until it encounters a
    /// line containing only `.` (the Perl format terminator).
    /// `next_token` dispatches to `parse_format_body` while this mode is set.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
593
594 // Internal helper methods
595
596 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
597 #[inline(always)]
598 fn byte_at(bytes: &[u8], index: usize) -> u8 {
599 debug_assert!(index < bytes.len());
600 match bytes.get(index) {
601 Some(&byte) => byte,
602 None => 0,
603 }
604 }
605
606 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
607 #[inline(always)]
608 fn current_char(&self) -> Option<char> {
609 if self.position < self.input_bytes.len() {
610 // For ASCII, direct access is safe
611 let byte = Self::byte_at(self.input_bytes, self.position);
612 if byte < 128 {
613 Some(byte as char)
614 } else {
615 // For non-ASCII, fall back to proper UTF-8 parsing
616 self.input.get(self.position..).and_then(|s| s.chars().next())
617 }
618 } else {
619 None
620 }
621 }
622
623 #[inline(always)]
624 fn peek_char(&self, offset: usize) -> Option<char> {
625 if offset > self.config.max_lookahead {
626 return None;
627 }
628
629 let pos = self.position.checked_add(offset)?;
630 if pos < self.input_bytes.len() {
631 // For ASCII, direct access is safe
632 let byte = Self::byte_at(self.input_bytes, pos);
633 if byte < 128 {
634 Some(byte as char)
635 } else {
636 // For non-ASCII, use chars iterator
637 self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
638 }
639 } else {
640 None
641 }
642 }
643
644 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
645 #[inline(always)]
646 fn advance(&mut self) {
647 if self.position < self.input_bytes.len() {
648 let byte = Self::byte_at(self.input_bytes, self.position);
649 if byte < 128 {
650 // ASCII fast path
651 self.position += 1;
652 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
653 {
654 self.position += ch.len_utf8();
655 }
656 }
657 }
658
    /// General-purpose balanced-segment consumer (no quote-boundary recovery).
    ///
    /// Consumes an `open`-delimited segment starting at the cursor, tracking
    /// nesting depth and skipping backslash-escaped characters. Returns the
    /// position just past the matching `close`, or `None` if the opener is
    /// absent or the segment never closes (the cursor is left wherever
    /// scanning stopped).
    ///
    /// NOTE(review): the `open` arm is matched before `close`, so passing the
    /// same character for both would increment depth forever — callers are
    /// assumed to pass distinct paired delimiters; confirm at call sites.
    ///
    /// For use inside double-quoted string interpolation where the outer `"` must
    /// act as a recovery boundary, use [`consume_balanced_segment_in_string`] instead.
    #[allow(dead_code)]
    #[inline]
    fn consume_balanced_segment(&mut self, open: char, close: char) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // Ran off the end of input without closing the segment.
        None
    }
697
    /// Balanced-segment consumer for use inside a quoted string.
    ///
    /// Like `consume_balanced_segment`, but bails out (returns `None`) as
    /// soon as the string's closing quote `terminator` is seen, so the outer
    /// string parser can still terminate its token cleanly.
    ///
    /// NOTE(review): the `terminator` arm is matched before `open`/`close`,
    /// so if `terminator` equals either delimiter the bail-out wins; callers
    /// are assumed to pass distinct characters — confirm at call sites.
    #[inline]
    fn consume_balanced_segment_in_string(
        &mut self,
        open: char,
        close: char,
        terminator: char,
    ) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == terminator => {
                    // Local recovery for interpolation tails in quoted strings:
                    // stop at the closing quote so the outer string parser can
                    // still terminate this token cleanly.
                    return None;
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // Ran off the end of input without closing the segment.
        None
    }
742
743 /// Fast byte-level check for ASCII characters
744 #[inline]
745 fn peek_byte(&self, offset: usize) -> Option<u8> {
746 if offset > self.config.max_lookahead {
747 return None;
748 }
749
750 let pos = self.position.checked_add(offset)?;
751 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
752 }
753
754 /// Check if the next bytes match a pattern (ASCII only)
755 #[inline]
756 fn matches_bytes(&self, pattern: &[u8]) -> bool {
757 let Some(end_offset) = pattern.len().checked_sub(1) else {
758 return true;
759 };
760
761 if end_offset > self.config.max_lookahead {
762 return false;
763 }
764
765 let Some(end) = self.position.checked_add(pattern.len()) else {
766 return false;
767 };
768
769 if end <= self.input_bytes.len() {
770 &self.input_bytes[self.position..end] == pattern
771 } else {
772 false
773 }
774 }
775
    /// Skip whitespace, `#` comments, and POD blocks before the next token.
    ///
    /// Side effects: maintains `after_newline`, and assigns `body_start` to
    /// the first unresolved pending heredoc when a newline is consumed.
    /// Always returns `Some(())`; the `Option` return type keeps `?` usable
    /// at call sites.
    #[inline]
    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
        // Don't reset after_newline if we're at the start of a line
        if self.position > 0 && self.position != self.line_start_offset {
            self.after_newline = false;
        }

        while self.position < self.input_bytes.len() {
            let byte = Self::byte_at(self.input_bytes, self.position);
            match byte {
                // Fast path for ASCII whitespace - batch process
                b' ' => {
                    // Batch skip spaces for better cache efficiency
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b' '
                    {
                        self.position += 1;
                    }
                    // Continue outer loop if we processed any spaces
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\t' | 0x0B | 0x0C => {
                    // Batch skip horizontal tab, vertical tab, and form feed.
                    // Perl treats these as whitespace separators.
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && matches!(
                            Self::byte_at(self.input_bytes, self.position),
                            b'\t' | 0x0B | 0x0C
                        )
                    {
                        self.position += 1;
                    }
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\r' | b'\n' => {
                    self.consume_newline();

                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                    // Only check if we have pending heredocs to avoid unnecessary work
                    if !self.pending_heredocs.is_empty() {
                        for spec in &mut self.pending_heredocs {
                            if spec.body_start == 0 {
                                spec.body_start = self.position;
                                break; // Only set for the first unresolved heredoc
                            }
                        }
                    }
                }
                b'#' => {
                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                    // (e.g. `q#...#`), so stop skipping and let the tokenizer see it.
                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
                        break;
                    }

                    // Skip line comment using memchr for fast newline search
                    self.position += 1; // Skip # directly

                    // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
                    if let Some(newline_offset) =
                        memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
                    {
                        self.position += newline_offset;
                    } else {
                        // No newline found, skip to end
                        self.position = self.input_bytes.len();
                    }
                }
                // '=' at the start of a line may introduce a POD section.
                b'=' if self.position == 0
                    || (self.position > 0
                        && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
                {
                    // Check if this starts a POD section (=pod, =head, =over, etc.)
                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                    let remaining = &self.input_bytes[self.position..];
                    if remaining.starts_with(b"=pod")
                        || remaining.starts_with(b"=head")
                        || remaining.starts_with(b"=over")
                        || remaining.starts_with(b"=item")
                        || remaining.starts_with(b"=back")
                        || remaining.starts_with(b"=begin")
                        || remaining.starts_with(b"=end")
                        || remaining.starts_with(b"=for")
                        || remaining.starts_with(b"=encoding")
                    {
                        // Scan forward for \n=cut (end of POD block)
                        let search_start = self.position;
                        let mut found_cut = false;
                        let bytes = self.input_bytes;
                        let mut i = search_start;
                        while i < bytes.len() {
                            // Look for =cut at the start of a line
                            if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
                                && bytes[i..].starts_with(b"=cut")
                            {
                                i += 4; // Skip "=cut"
                                // Skip rest of the =cut line
                                while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
                                    i += 1;
                                }
                                // Consume one line ending sequence if present
                                // (handles CRLF, lone CR, and lone LF)
                                if i < bytes.len() && bytes[i] == b'\r' {
                                    i += 1;
                                    if i < bytes.len() && bytes[i] == b'\n' {
                                        i += 1;
                                    }
                                } else if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                                self.position = i;
                                found_cut = true;
                                break;
                            }
                            i += 1;
                        }
                        if !found_cut {
                            // POD extends to end of file
                            self.position = bytes.len();
                        }
                        continue;
                    }
                    // Not a POD directive - regular '=' token
                    break;
                }
                _ => {
                    // For non-ASCII whitespace, use char check only when needed
                    if byte >= 128
                        && let Some(ch) = self.current_char()
                        && ch.is_whitespace()
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }
        }
        Some(())
    }
920
    /// Try to lex a heredoc introducer: `<<LABEL`, `<<~LABEL`, `<<"L"`,
    /// `<<'L'`, `` <<`L` ``, or `<<\L`.
    ///
    /// On success emits a `HeredocStart` token and queues a [`HeredocSpec`];
    /// the body itself is consumed later by `next_token` once the current
    /// line ends. Returns `None` (with the cursor restored) when `<<` is
    /// really the left-shift operator.
    fn try_heredoc(&mut self) -> Option<Token> {
        // `<<` is the left-shift operator, not a heredoc, when we are inside
        // a parenthesized expression and have just finished a term.
        // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
        // so `<<index` must be the bitshift operator, not a heredoc start.
        //
        // We must NOT fire the guard at statement level (paren_depth == 0) because
        // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
        // is a heredoc. The depth check distinguishes the two cases.
        if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
            return None;
        }

        // Check for heredoc start
        if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
            return None;
        }

        let start = self.position;
        let mut text = String::from("<<");
        self.position += 2; // Skip <<

        // Check for indented heredoc (~)
        let allow_indent = if self.current_char() == Some('~') {
            text.push('~');
            self.advance();
            true
        } else {
            false
        };

        // Skip whitespace (kept in `text` so the token reproduces the source)
        while let Some(ch) = self.current_char() {
            if ch == ' ' || ch == '\t' {
                text.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        // Optional backslash disables interpolation, treat like single-quoted label
        let backslashed = if self.current_char() == Some('\\') {
            text.push('\\');
            self.advance();
            true
        } else {
            false
        };

        // Parse delimiter
        let delimiter = if self.position < self.input.len() {
            match self.current_char() {
                Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
                Some('\'') if !backslashed => {
                    self.parse_quoted_heredoc_delimiter('\'', &mut text)?
                }
                Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
                Some(c) if is_perl_identifier_start(c) => {
                    // Bare word delimiter
                    let mut delim = String::new();
                    while self.position < self.input.len() {
                        if let Some(c) = self.current_char() {
                            if is_perl_identifier_continue(c) {
                                delim.push(c);
                                text.push(c);
                                self.advance();
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    delim
                }
                _ => {
                    // Not a valid heredoc delimiter - reset position and return None
                    // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
                    self.position = start;
                    return None;
                }
            }
        } else {
            // No delimiter found - reset position and return None
            self.position = start;
            return None;
        };

        // For now, return a placeholder token
        // The actual heredoc body would be parsed later when we encounter it
        self.mode = LexerMode::ExpectOperator;

        // Recursion depth limit (Issue #443)
        if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
            return Some(Token {
                token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
                text: Arc::from(text),
                start,
                end: self.position,
            });
        }

        // Queue the heredoc spec with its label
        self.pending_heredocs.push(HeredocSpec {
            label: Arc::from(delimiter.as_str()),
            body_start: 0, // Will be set when we see the newline after this line
            allow_indent,
        });

        Some(Token {
            token_type: TokenType::HeredocStart,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1038
1039 fn try_string(&mut self) -> Option<Token> {
1040 let start = self.position;
1041 let quote = self.current_char()?;
1042
1043 match quote {
1044 '"' => self.parse_double_quoted_string(start),
1045 '\'' => self.parse_single_quoted_string(start),
1046 '`' => self.parse_backtick_string(start),
1047 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1048 _ => None,
1049 }
1050 }
1051
    /// Lex a numeric literal at the current position.
    ///
    /// Handles hex (`0x`), binary (`0b`), and explicit octal (`0o`) prefixes,
    /// plain decimal integers, floats, and exponents, all with `_` digit
    /// separators. Returns `None` when the current byte is not an ASCII
    /// digit; on success the lexer switches to `ExpectOperator` mode.
    ///
    /// Backtracking rules visible in the code below:
    /// - A radix prefix with no real digit after it (e.g. `0x_` or bare `0x`)
    ///   falls through so the leading `0` is lexed as a decimal number.
    /// - An `e`/`E` with no digit after it (even after a sign) is not part of
    ///   the number; the scan rewinds to just before the marker.
    /// - A trailing `.` is consumed only when followed by EOF, whitespace, or
    ///   an operator-like byte, so ranges like `1..5` keep `..` intact.
    #[inline]
    fn try_number(&mut self) -> Option<Token> {
        let start = self.position;

        // Fast byte check for digits - optimized bounds checking
        let bytes = self.input_bytes;
        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
            return None;
        }

        // Check for hex (0x), binary (0b), or octal (0o) prefixes
        let mut pos = self.position;
        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
            let prefix_byte = bytes[pos + 1];
            if prefix_byte == b'x' || prefix_byte == b'X' {
                // Hexadecimal: 0x[0-9a-fA-F_]+
                pos += 2; // consume '0x'
                let digit_start = pos;
                // saw_digit guards against `0x____`: underscores alone do not
                // make a valid hex literal.
                let mut saw_digit = false;
                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
                    saw_digit |= bytes[pos].is_ascii_hexdigit();
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No hex digits after 0x - fall through to parse '0' as decimal
            } else if prefix_byte == b'b' || prefix_byte == b'B' {
                // Binary: 0b[01_]+
                pos += 2; // consume '0b'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
                {
                    saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No binary digits after 0b - fall through to parse '0' as decimal
            } else if prefix_byte == b'o' || prefix_byte == b'O' {
                // Octal (explicit): 0o[0-7_]+
                pos += 2; // consume '0o'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
                {
                    saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No octal digits after 0o - fall through to parse '0' as decimal
            }
        }

        // Consume initial digits - unrolled for better performance
        // (pos is re-synced to self.position here in case a radix prefix
        // above advanced pos but then fell through without committing.)
        pos = self.position;
        while pos < bytes.len() {
            let byte = Self::byte_at(bytes, pos);
            if byte.is_ascii_digit() || byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        self.position = pos;

        // Check for decimal point - optimized with single bounds check
        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
            // Peek ahead to see what follows the dot
            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();

            // Optimized dot consumption logic: take the dot when digits
            // follow, at EOF, or before whitespace/operator bytes; note '.'
            // itself is NOT in the list, so `1..5` leaves `..` untouched.
            let should_consume_dot = has_following_digit || {
                pos + 1 >= bytes.len() || {
                    // Use bitwise operations for faster character classification
                    let next_byte = bytes[pos + 1];
                    // Whitespace, delimiters, operators - optimized check
                    next_byte <= b' '
                        || matches!(
                            next_byte,
                            b';' | b','
                                | b')'
                                | b'}'
                                | b']'
                                | b'+'
                                | b'-'
                                | b'*'
                                | b'/'
                                | b'%'
                                | b'='
                                | b'<'
                                | b'>'
                                | b'!'
                                | b'&'
                                | b'|'
                                | b'^'
                                | b'~'
                                | b'e'
                                | b'E'
                        )
                }
            };

            if should_consume_dot {
                pos += 1; // consume the dot
                // Consume fractional digits - batch processing
                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                self.position = pos;
            }
        }

        // Check for exponent - optimized
        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
            let exp_start = pos;
            pos += 1; // consume 'e' or 'E'

            // Check for optional sign
            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                pos += 1;
            }

            // Must have at least one digit after exponent (underscores allowed between digits)
            let mut saw_digit = false;
            while pos < bytes.len() {
                let byte = bytes[pos];
                if byte.is_ascii_digit() {
                    saw_digit = true;
                    pos += 1;
                } else if byte == b'_' {
                    pos += 1;
                } else {
                    break;
                }
            }

            // If no digits after exponent, backtrack
            if !saw_digit {
                pos = exp_start;
            }

            self.position = pos;
        }

        // Avoid string slicing for common number cases - use Arc::from directly on slice
        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Number(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1238
1239 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1240 // We're at the dot, consume it
1241 self.advance();
1242
1243 // Parse the fractional part
1244 while self.position < self.input_bytes.len() {
1245 let byte = self.input_bytes[self.position];
1246 match byte {
1247 b'0'..=b'9' | b'_' => self.position += 1,
1248 b'e' | b'E' => {
1249 // Handle scientific notation.
1250 // Save the position of 'e'/'E' so we can backtrack here if
1251 // no digits follow the exponent marker (with or without sign).
1252 let e_pos = self.position;
1253 self.advance();
1254 if self.position < self.input_bytes.len() {
1255 let next = self.input_bytes[self.position];
1256 if next == b'+' || next == b'-' {
1257 self.advance();
1258 }
1259 }
1260 // Parse exponent digits (underscores allowed between digits)
1261 let exponent_start = self.position;
1262 let mut saw_digit = false;
1263 while self.position < self.input_bytes.len() {
1264 let byte = self.input_bytes[self.position];
1265 if byte.is_ascii_digit() {
1266 saw_digit = true;
1267 self.position += 1;
1268 } else if byte == b'_' {
1269 self.position += 1;
1270 } else {
1271 break;
1272 }
1273 }
1274
1275 // No digits after exponent marker — backtrack to just before
1276 // 'e'/'E' so the caller sees it as a separate token.
1277 // Using e_pos (not exponent_start-1) avoids including 'e' in
1278 // the number slice when a sign character was consumed.
1279 if !saw_digit {
1280 let _ = exponent_start; // mark as intentionally unused
1281 self.position = e_pos;
1282 }
1283 break;
1284 }
1285 _ => break,
1286 }
1287 }
1288
1289 let text = &self.input[start..self.position];
1290 self.mode = LexerMode::ExpectOperator;
1291
1292 Some(Token {
1293 token_type: TokenType::Number(Arc::from(text)),
1294 text: Arc::from(text),
1295 start,
1296 end: self.position,
1297 })
1298 }
1299
    /// Lex a variable token beginning with a sigil (`$`, `@`, `%`, `*`).
    ///
    /// Covers plain variables (`$foo`, `@Foo::bar`), the `$#array` length
    /// form, braced forms (`${foo}`, `${^MATCH}`, `$::{foo}`, `*{$glob}`),
    /// punctuation variables (`$!`, `$?`, `$$`, `@+`, `%-`, `$^W`, ...),
    /// and the split-token cases where only the bare sigil is returned so
    /// the parser can handle a dereference (`@{$ref}`) or postfix deref
    /// (`->@*`). Returns `None` when the current char is not a sigil, or
    /// when `%`/`*` appear in `ExpectOperator` mode (operators there).
    ///
    /// Side effects: advances the position past the consumed token, sets
    /// `mode = ExpectOperator`, and maintains `after_var_subscript` so a
    /// following `{` is treated as a subscript rather than a block opener.
    fn try_variable(&mut self) -> Option<Token> {
        let start = self.position;
        let sigil = self.current_char()?;

        match sigil {
            '$' | '@' | '%' | '*' => {
                // In ExpectOperator mode, treat % and * as operators rather than sigils
                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                    return None;
                }
                self.advance();

                // Special case: After ->, sigils followed by { or [ should be tokenized separately
                // This is for postfix dereference like ->@*, ->%{}, ->@[]
                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
                let check_arrow = self.position >= 3
                    && self.position.saturating_sub(1) <= self.input.len()
                    && self.input.is_char_boundary(self.position.saturating_sub(3))
                    && self.input.is_char_boundary(self.position.saturating_sub(1));

                if check_arrow
                    && {
                        // Temporarily rewind 3 bytes (past "->" + the sigil we
                        // just consumed) to test for the arrow, then restore.
                        let saved = self.position;
                        self.position -= 3;
                        let arrow = self.matches_bytes(b"->");
                        self.position = saved;
                        arrow
                    }
                    && matches!(self.current_char(), Some('{' | '[' | '*'))
                {
                    // Just return the sigil
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for $# (array length operator)
                if sigil == '$' && self.current_char() == Some('#') {
                    self.advance(); // consume #
                    // Now parse the array name
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else if ch == ':' && self.peek_char(1) == Some(':') {
                            // Package-qualified array name
                            self.advance();
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // $#foo is a complete variable token; a following `{` is a subscript.
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
                if self.current_char() == Some('{') {
                    // Peek ahead to decide if we should consume the brace
                    let next_char = self.peek_char(1);

                    // Check if this is a dereference like @{$ref} or @{[...]}
                    // If the next char suggests dereference, don't consume the brace.
                    // For @ and % sigils, identifiers inside braces are also derefs
                    // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                    let is_deref = sigil != '*'
                        && (matches!(
                            next_char,
                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                        ) || (matches!(sigil, '@' | '%')
                            && next_char.is_some_and(is_perl_identifier_start)));
                    if is_deref {
                        // This is a dereference, don't consume the brace
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // A standalone sigil token before `{` starts a dereference
                        // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
                        // Mark it as subscript-capable so `{` increments brace depth
                        // and the closing `}` can enable chained `{...}` subscripts.
                        // (Broader form than master's `$|@|%` filter — `*` is already
                        // excluded by the `is_deref` guard above and `&` deref also
                        // benefits from chained-subscript handling.)
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    self.advance(); // consume {

                    // Handle special variables with caret
                    if self.current_char() == Some('^') {
                        self.advance(); // consume ^
                        // Parse the special variable name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Handle stash access like $::{foo}
                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance(); // consume first :
                        self.advance(); // consume second :
                        // Skip optional { and }
                        if self.current_char() == Some('{') {
                            self.advance();
                        }
                        // Parse the name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                if self.current_char() == Some('}') {
                                    self.advance(); // consume closing } of ${...}
                                }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Regular braced variable like ${foo} or glob like *{$glob}
                    else {
                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                        // EXCEPT for globs - *{$glob} should be parsed as one token
                        // Also check for empty braces or EOF - in these cases we should split the tokens
                        if sigil != '*'
                            && (matches!(
                                self.current_char(),
                                Some(
                                    '$' | '@'
                                        | '%'
                                        | '*'
                                        | '&'
                                        | '['
                                        | ' '
                                        | '\t'
                                        | '\n'
                                        | '\r'
                                        | '}'
                                )
                            ) || self.current_char().is_none())
                        {
                            // This is a dereference or empty/invalid brace, backtrack
                            self.position = start + 1; // Just past the sigil
                            let text = &self.input[start..self.position];
                            self.mode = LexerMode::ExpectOperator;
                            // Same as above: sigil-only token means a dereference opener.
                            self.after_var_subscript = true;

                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                text: Arc::from(text),
                                start,
                                end: self.position,
                            });
                        }

                        // For glob access, we need to consume everything inside braces
                        if sigil == '*' {
                            let mut brace_depth: usize = 1;
                            while let Some(ch) = self.current_char() {
                                if ch == '{' {
                                    brace_depth += 1;
                                } else if ch == '}' {
                                    brace_depth = brace_depth.saturating_sub(1);
                                    if brace_depth == 0 {
                                        self.advance(); // consume final }
                                        break;
                                    }
                                }
                                self.advance();
                            }
                        } else {
                            // Regular variable
                            while let Some(ch) = self.current_char() {
                                if ch == '}' {
                                    self.advance(); // consume }
                                    break;
                                } else if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                }
                // Parse regular variable name
                else if let Some(ch) = self.current_char() {
                    if is_perl_identifier_start(ch) {
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                        // Handle package-qualified segments like Foo::bar
                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                            self.advance();
                            self.advance();
                            while let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                    // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                    // Not inside prototypes where ^ is a literal prototype char
                    else if sigil == '$' && ch == '^' && !self.in_prototype {
                        self.advance(); // consume ^
                        // $^Letter: consume the single uppercase letter
                        if let Some(letter) = self.current_char()
                            && letter.is_ascii_uppercase()
                        {
                            self.advance();
                        }
                        // bare $^ (no uppercase letter follows): format_top_name — stop here
                    }
                    // Handle special punctuation variables
                    // Not inside prototypes where ; and , are literal prototype chars
                    else if sigil == '$'
                        && !self.in_prototype
                        && matches!(
                            ch,
                            '?' | '!'
                                | '@'
                                | '&'
                                | '`'
                                | '\''
                                | '.'
                                | '/'
                                | '\\'
                                | '|'
                                | '+'
                                | '-'
                                | '['
                                | ']'
                                | '$'
                                | '~'
                                | '='
                                | '%'
                                | ','
                                | '"'
                                | ';'
                                | '>'
                                | '<'
                                | ')'
                                | '(' // $( = real group ID of this process
                        )
                    {
                        self.advance(); // consume the special character
                    }
                    // $$ is the PID special variable, but only when it is not immediately
                    // followed by an identifier-start character. $$var is scalar dereference
                    // of $var, so keep the second $ for the next token.
                    else if sigil == '$' && ch == '$' {
                        if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                            self.advance(); // consume the second $ for bare $$ PID
                        }
                    }
                    // Handle special array/hash punctuation variables
                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                        self.advance(); // consume the + or -
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // A complete $foo, @foo, %foo token can be followed by a hash/slice
                // subscript `{`. Set the flag so the `{` handler knows to increment
                // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
                // hash subscripts in the same way.
                self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

                Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                })
            }
            _ => None,
        }
    }
1616
1617 /// Return the next non-space char and the char immediately following it (without consuming).
1618 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1619 /// from `=` used as a plain delimiter.
1620 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1621 let mut i = self.position;
1622 while i < self.input.len() {
1623 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1624 Some(c) => c,
1625 None => return (None, None),
1626 };
1627 if c.is_whitespace() {
1628 i += c.len_utf8();
1629 continue;
1630 }
1631 // Found non-space at position i; peek the next char after it
1632 let j = i + c.len_utf8();
1633 let following = self.input.get(j..).and_then(|s| s.chars().next());
1634 return (Some(c), following);
1635 }
1636 (None, None)
1637 }
1638
1639 /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1640 fn is_quote_delim(c: char) -> bool {
1641 // Perl allows any non-alphanumeric, non-whitespace character as delimiter,
1642 // including control characters (e.g. s\x07pattern\x07replacement\x07).
1643 !c.is_ascii_alphanumeric() && !c.is_whitespace()
1644 }
1645
1646 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1647 ///
1648 /// A v-string starts with `v` followed by one or more digits, then optionally
1649 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1650 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1651 #[inline]
1652 fn try_vstring(&mut self) -> Option<Token> {
1653 let start = self.position;
1654 let bytes = self.input_bytes;
1655
1656 // Must start with 'v' followed by at least one digit
1657 if start >= bytes.len() || bytes[start] != b'v' {
1658 return None;
1659 }
1660
1661 let next_pos = start + 1;
1662 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1663 return None;
1664 }
1665
1666 // We have `v` followed by a digit — scan the rest of the v-string.
1667 // Pattern: v DIGITS (.DIGITS)*
1668 let mut pos = next_pos;
1669
1670 // Consume leading digits
1671 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1672 pos += 1;
1673 }
1674
1675 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1676 while pos < bytes.len() && bytes[pos] == b'.' {
1677 let dot_pos = pos;
1678 pos += 1; // skip '.'
1679
1680 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1681 // Dot not followed by digit — not part of the v-string
1682 pos = dot_pos;
1683 break;
1684 }
1685
1686 // Consume digits after the dot
1687 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1688 pos += 1;
1689 }
1690 }
1691
1692 // Make sure the v-string isn't followed by identifier-continuation characters
1693 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1694 if pos < bytes.len() {
1695 let next_byte = bytes[pos];
1696 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1697 return None;
1698 }
1699 // Also check for non-ASCII identifier continuations
1700 if next_byte >= 128
1701 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1702 && is_perl_identifier_continue(ch)
1703 {
1704 return None;
1705 }
1706 }
1707
1708 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1709 let text = &self.input[start..pos];
1710
1711 self.position = pos;
1712 self.mode = LexerMode::ExpectOperator;
1713
1714 Some(Token {
1715 token_type: TokenType::Version(Arc::from(text)),
1716 text: Arc::from(text),
1717 start,
1718 end: self.position,
1719 })
1720 }
1721
    /// Lex a bareword: keyword, identifier, or quote-like operator.
    ///
    /// Responsibilities, in order:
    /// 1. Detect `s'`, `y'`, `tr'` (apostrophe-delimited substitution /
    ///    transliteration) before the apostrophe would be swallowed as part
    ///    of an identifier.
    /// 2. Consume the identifier proper, including `'` inside names and
    ///    `Foo::Bar` package qualification (fast ASCII path with a Unicode
    ///    fallback via `is_perl_identifier_continue`).
    /// 3. Recognize `__DATA__` / `__END__` markers — only at line start, in
    ///    the code channel, with nothing but whitespace to end of line.
    /// 4. Detect `s`/`tr`/`y` and other quote operators with arbitrary
    ///    delimiters, applying fat-arrow (`s => 1`) and whitespace rules so
    ///    barewords are not misread as operators.
    /// 5. Classify keyword vs. identifier, updating the term/operator mode
    ///    so `/` disambiguation (regex vs. division) works downstream.
    ///
    /// Returns `None` when the current char cannot start an identifier.
    #[inline]
    fn try_identifier_or_keyword(&mut self) -> Option<Token> {
        let start = self.position;
        let ch = self.current_char()?;
        let bytes = self.input_bytes;
        let len = bytes.len();

        if is_perl_identifier_start(ch) {
            // Special case: substitution/transliteration with single-quote delimiter
            // The single quote is considered an identifier continuation, so we need to
            // detect these operators before consuming it as part of an identifier.
            if !self.after_arrow
                && self.hash_brace_depth == 0
                && ch == 's'
                && self.peek_char(1) == Some('\'')
            {
                self.advance(); // consume 's'
                return self.parse_substitution(start);
            } else if !self.after_arrow
                && self.hash_brace_depth == 0
                && ch == 'y'
                && self.peek_char(1) == Some('\'')
            {
                self.advance(); // consume 'y'
                return self.parse_transliteration(start);
            } else if !self.after_arrow
                && self.hash_brace_depth == 0
                && ch == 't'
                && self.peek_char(1) == Some('r')
                && self.peek_char(2) == Some('\'')
            {
                self.advance(); // consume 't'
                self.advance(); // consume 'r'
                return self.parse_transliteration(start);
            }

            // Fast ASCII path for identifier continuation.
            while self.position < len {
                let byte = bytes[self.position];
                if byte == b'\'' && is_quote_op_word_prefix(&bytes[start..self.position]) {
                    // Keep apostrophe for quote-operator parsing in cases like q'...'.
                    break;
                }

                if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
                    self.position += 1;
                    continue;
                }

                // Any other ASCII byte ends the identifier.
                if byte < 128 {
                    break;
                }

                // Non-ASCII: fall back to the char-based Unicode check.
                if let Some(ch) = self.current_char()
                    && is_perl_identifier_continue(ch)
                {
                    self.advance();
                    continue;
                }
                break;
            }
            // Handle package-qualified identifiers like Foo::bar.
            while self.config.max_lookahead >= 1
                && self.position + 1 < len
                && bytes[self.position] == b':'
                && bytes[self.position + 1] == b':'
            {
                self.position += 2; // consume '::'

                // consume following identifier segment if present
                let Some(ch) = self.current_char() else {
                    break;
                };
                if !is_perl_identifier_start(ch) {
                    break;
                }
                self.advance();
                while self.position < len {
                    let byte = bytes[self.position];
                    if byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'\'' {
                        self.position += 1;
                        continue;
                    }
                    if byte < 128 {
                        break;
                    }
                    if let Some(ch) = self.current_char()
                        && is_perl_identifier_continue(ch)
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }

            let text = &self.input[start..self.position];

            // Check for __DATA__ and __END__ markers using exact match
            // Only recognize these in code channel, not inside data/format sections or heredocs
            let in_code_channel =
                !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
                    && self.pending_heredocs.is_empty();

            let marker = if in_code_channel {
                if text == "__DATA__" {
                    Some("__DATA__")
                } else if text == "__END__" {
                    Some("__END__")
                } else {
                    None
                }
            } else {
                None
            };

            if let Some(marker_text) = marker {
                // These must be at the beginning of a line
                // Use the after_newline flag to determine if we're at line start
                if self.after_newline {
                    // Check if rest of line is only whitespace
                    // Only treat as data marker if line has no trailing junk
                    if Self::trailing_ws_only(self.input_bytes, self.position) {
                        // Consume the rest of the line (the marker line)
                        while self.position < self.input.len()
                            && self.input_bytes[self.position] != b'\n'
                            && self.input_bytes[self.position] != b'\r'
                        {
                            self.advance();
                        }
                        self.consume_newline();

                        // Switch to data section mode
                        self.mode = LexerMode::InDataSection;

                        return Some(Token {
                            token_type: TokenType::DataMarker(Arc::from(marker_text)),
                            text: Arc::from(marker_text),
                            start,
                            end: self.position,
                        });
                    }
                }
            }

            // Check for substitution/transliteration operators
            // Skip if after '->' -- these are method names, not operators.
            #[allow(clippy::collapsible_if)]
            if !self.after_arrow && self.hash_brace_depth == 0 && matches!(text, "s" | "tr" | "y") {
                let immediate = self.current_char();
                let (candidate, char_after_next, has_whitespace) =
                    if immediate.is_some_and(|c| c.is_whitespace()) {
                        let (nc, ca) = self.peek_nonspace_and_following();
                        (nc, ca, true)
                    } else {
                        let following = immediate.and_then(|c| {
                            let j = self.position + c.len_utf8();
                            self.input.get(j..).and_then(|s| s.chars().next())
                        });
                        (immediate, following, false)
                    };

                if let Some(next) = candidate {
                    // `s => 1` should remain a fat-arrow hash key, not quote op.
                    let is_fat_arrow = next == '=' && char_after_next == Some('>');
                    let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                    let is_quote_char = matches!(next, '\'' | '"') && text != "s";
                    let transliteration_allows_whitespace = text == "tr" || text == "y";
                    let substitution_disallows_whitespace = text == "s" && has_whitespace;
                    let is_valid_delim = Self::is_quote_delim(next)
                        && !is_fat_arrow
                        && !substitution_disallows_whitespace
                        && (!has_whitespace
                            || is_paired_delim
                            || is_quote_char
                            || transliteration_allows_whitespace);

                    if is_valid_delim {
                        match text {
                            "s" => return self.parse_substitution(start),
                            "tr" | "y" => return self.parse_transliteration(start),
                            // Defensive: unreachable given the matches! guard
                            // above, but kept so a future edit can't silently
                            // fall through.
                            unexpected => {
                                return Some(Token {
                                    token_type: TokenType::Error(Arc::from(format!(
                                        "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
                                        unexpected, start
                                    ))),
                                    text: Arc::from(unexpected),
                                    start,
                                    end: self.position,
                                });
                            }
                        }
                    }
                }
            }

            let token_type = if is_keyword_fast(text) {
                // Check for special keywords that affect lexer mode
                match text {
                    "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
                    | "sort" | "split" | "and" | "or" | "xor" | "not"
                    // These keywords introduce an expression, so a following `/` is a
                    // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
                    // `do /file/`, and `eval /re/` are all valid Perl.
                    | "return" | "die" | "warn" | "do" | "eval" => {
                        self.mode = LexerMode::ExpectTerm;
                    }
                    "sub" => {
                        self.after_sub = true;
                        self.mode = LexerMode::ExpectTerm;
                    }
                    // Quote operators expect a delimiter next.
                    // Skip if after '->' -- these are method names, not operators.
                    // Skip inside hash subscript braces (hash_brace_depth > 0) — all
                    // positions inside `$h{...}` or `@h{...}` treat quote-op names as
                    // bareword keys, including after commas in slices like `@h{m, s}`.
                    op if !self.after_arrow
                        && self.hash_brace_depth == 0
                        && quote_handler::is_quote_operator(op) =>
                    {
                        // Perl allows whitespace between a quote-like operator and its delimiter,
                        // but ONLY for paired delimiters (s { ... } { ... }g).
                        // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
                        // must be immediately adjacent — otherwise `s $foo` would wrongly
                        // treat `$` as a delimiter instead of being a bareword `s` followed
                        // by a scalar variable.
                        //
                        // Strategy:
                        // 1. Check the immediately-adjacent char first (no whitespace skip).
                        //    If it is a valid delimiter → any non-alnum, non-whitespace char.
                        // 2. If the adjacent char is whitespace, peek past it.
                        //    Only accept PAIRED delimiters ({, [, (, <) in that case.
                        let immediate = self.current_char();
                        let (candidate, char_after_next, has_whitespace) =
                            if immediate.is_some_and(|c| c.is_whitespace()) {
                                // There is whitespace — peek past it
                                let (nc, ca) = self.peek_nonspace_and_following();
                                (nc, ca, true)
                            } else {
                                // No whitespace — use immediate char
                                let following = immediate.and_then(|c| {
                                    let j = self.position + c.len_utf8();
                                    self.input.get(j..).and_then(|s| s.chars().next())
                                });
                                (immediate, following, false)
                            };

                        if let Some(next) = candidate {
                            // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
                            // not a valid substitution delimiter. Treat as identifier.
                            let is_fat_arrow = next == '=' && char_after_next == Some('>');

                            // When whitespace precedes the delimiter, only unambiguous
                            // delimiters are accepted:
                            // - Paired delimiters ({, [, (, <) are always safe.
                            // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
                            //   is a valid file-size filetest and must not be treated as a
                            //   substitution start. All other operators (qw, q, qq, qr, qx, m,
                            //   tr, y) have no corresponding file-test operator.
                            // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
                            let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                            let is_quote_char = matches!(next, '\'' | '"') && op != "s";
                            let is_valid_delim = Self::is_quote_delim(next)
                                && !is_fat_arrow
                                && (!has_whitespace || is_paired_delim || is_quote_char);

                            if is_valid_delim {
                                self.mode = LexerMode::ExpectDelimiter;
                                self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
                                    operator: op.to_string(),
                                    delimiter: '\0', // Will be set when we see the delimiter
                                    start_pos: start,
                                });

                                // Don't return a keyword token - continue to parse the delimiter
                                // Skip any whitespace between operator and delimiter
                                while let Some(ch) = self.current_char() {
                                    if ch.is_whitespace() {
                                        self.advance();
                                    } else {
                                        break;
                                    }
                                }

                                // Get the delimiter
                                #[allow(clippy::collapsible_if)]
                                if let Some(delim) = self.current_char() {
                                    if !delim.is_alphanumeric() {
                                        self.advance();
                                        if let Some(ref mut info) = self.current_quote_op {
                                            info.delimiter = delim;
                                        }
                                        // Parse the quote operator content and return the complete token
                                        return self.parse_quote_operator(delim);
                                    }
                                }
                            } else {
                                // Not a quote operator here → treat as IDENTIFIER
                                self.current_quote_op = None;
                                self.mode = LexerMode::ExpectOperator;
                                return Some(Token {
                                    token_type: TokenType::Identifier(Arc::from(text)),
                                    start,
                                    end: self.position,
                                    text: Arc::from(text),
                                });
                            }
                        } else {
                            // End-of-input after the word → also treat as IDENTIFIER
                            self.current_quote_op = None;
                            self.mode = LexerMode::ExpectOperator;
                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                start,
                                end: self.position,
                                text: Arc::from(text),
                            });
                        }
                        // If we get here but haven't returned, something went wrong
                        // Fall through to treat as identifier
                        self.current_quote_op = None;
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            start,
                            end: self.position,
                            text: Arc::from(text),
                        });
                    }
                    // Format declarations need special handling
                    "format" => {
                        // We'll need to check for the = after the format name
                        // For now, just mark that we saw format
                    }
                    _ if is_builtin_function(text) => {
                        // Bare builtins are term-introducing in Perl.
                        self.mode = LexerMode::ExpectTerm;
                    }
                    _ => {
                        self.mode = LexerMode::ExpectOperator;
                    }
                }
                TokenType::Keyword(Arc::from(text))
            } else {
                // Mirror parser bare-builtin handling so `/` after builtins like
                // `join` or `print` is lexed as a regex term, not division.
                if is_builtin_function(text) {
                    self.mode = LexerMode::ExpectTerm;
                } else {
                    self.mode = LexerMode::ExpectOperator;
                }
                TokenType::Identifier(Arc::from(text))
            };

            self.after_arrow = false;
            // A keyword/identifier is not a variable; `{` after it is a block opener.
            self.after_var_subscript = false;
            // hash_brace_depth is managed by { and } handlers, not cleared per-token
            Some(Token { token_type, text: Arc::from(text), start, end: self.position })
        } else {
            None
        }
    }
2086
2087 /// Parse data section body - consumes everything to EOF
2088 fn parse_data_body(&mut self) -> Option<Token> {
2089 if self.position >= self.input.len() {
2090 // Already at EOF
2091 self.mode = LexerMode::ExpectTerm;
2092 return Some(Token {
2093 token_type: TokenType::EOF,
2094 text: Arc::from(""),
2095 start: self.position,
2096 end: self.position,
2097 });
2098 }
2099
2100 let start = self.position;
2101 // Consume everything to EOF
2102 let body = &self.input[self.position..];
2103 self.position = self.input.len();
2104
2105 // Reset mode for next parse (though we're at EOF)
2106 self.mode = LexerMode::ExpectTerm;
2107
2108 Some(Token {
2109 token_type: TokenType::DataBody(Arc::from(body)),
2110 text: Arc::from(body),
2111 start,
2112 end: self.position,
2113 })
2114 }
2115
2116 /// Parse format body - consumes until a line with just a dot
2117 fn parse_format_body(&mut self) -> Option<Token> {
2118 let start = self.position;
2119 let mut body = String::new();
2120 let mut line_start = true;
2121
2122 while self.position < self.input.len() {
2123 // Check if we're at the start of a line and the next char is a dot
2124 if line_start && self.current_char() == Some('.') {
2125 // Check if this line contains only a dot
2126 let mut peek_pos = self.position + 1;
2127 let mut found_terminator = true;
2128
2129 // Skip any trailing whitespace on the dot line
2130 while peek_pos < self.input.len() {
2131 match self.input_bytes[peek_pos] {
2132 b' ' | b'\t' | b'\r' => peek_pos += 1,
2133 b'\n' => break,
2134 _ => {
2135 found_terminator = false;
2136 break;
2137 }
2138 }
2139 }
2140
2141 if found_terminator {
2142 // We found the terminating dot, consume it
2143 self.position = peek_pos;
2144 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2145 {
2146 self.position += 1;
2147 }
2148
2149 // Switch back to normal mode
2150 self.mode = LexerMode::ExpectTerm;
2151
2152 return Some(Token {
2153 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2154 text: Arc::from(body),
2155 start,
2156 end: self.position,
2157 });
2158 }
2159 }
2160
2161 // Not a terminator, consume the character
2162 match self.current_char() {
2163 Some(ch) => {
2164 body.push(ch);
2165 self.advance();
2166
2167 // Track if we're at the start of a line
2168 line_start = ch == '\n';
2169 }
2170 None => {
2171 // Reached EOF without finding terminator
2172 break;
2173 }
2174 }
2175 }
2176
2177 // If we reach here, we didn't find a terminator
2178 self.mode = LexerMode::ExpectTerm;
2179 Some(Token {
2180 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2181 text: Arc::from(body),
2182 start,
2183 end: self.position,
2184 })
2185 }
2186
2187 fn try_operator(&mut self) -> Option<Token> {
2188 // Skip operator parsing if we're expecting a delimiter for a quote operator
2189 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2190 return None;
2191 }
2192
2193 let start = self.position;
2194 let ch = self.current_char()?;
2195
2196 // ═══════════════════════════════════════════════════════════════════════
2197 // SLASH DISAMBIGUATION STRATEGY (Issue #422)
2198 // ═══════════════════════════════════════════════════════════════════════
2199 //
2200 // Perl's `/` character is ambiguous:
2201 // - Division operator: `$x / 2`
2202 // - Regex delimiter: `/pattern/`
2203 // - Defined-or operator: `$x // $y`
2204 //
2205 // **Disambiguation Strategy (Context-Aware Heuristics):**
2206 //
2207 // 1. **Mode-Based Decision (Primary)**:
2208 // - `LexerMode::ExpectTerm` → `/` starts a regex
2209 // Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
2210 // - `LexerMode::ExpectOperator` → `/` is division or `//`
2211 // Examples: `$x / 2`, `$x // $y`, `) / 3`
2212 //
2213 // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
2214 // Mode is set based on previous token:
2215 // - After identifier/number/closing paren → ExpectOperator → division
2216 // - After operator/keyword/opening paren → ExpectTerm → regex
2217 //
2218 // 3. **Budget Protection**:
2219 // - Regex parsing has a parse-step budget and byte budget
2220 // - Budget exceeded → emit UnknownRest token (graceful degradation)
2221 // - See `parse_regex()` and `budget_guard()` for implementation
2222 //
2223 // 4. **Performance Characteristics**:
2224 // - Single-pass: O(1) decision based on mode flag
2225 // - No backtracking: Mode updated after each token
2226 // - Optimized: Byte-level operations for common cases
2227 //
2228 // **Metrics & Monitoring**:
2229 // - Budget exceeded events tracked via UnknownRest token emission
2230 // - LSP diagnostics generated for truncated regexes
2231 // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
2232 //
2233 // ═══════════════════════════════════════════════════════════════════════
2234
2235 if ch == '/' {
2236 if self.mode == LexerMode::ExpectTerm {
2237 // Mode indicates we're expecting a term → `/` starts a regex
2238 // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
2239 return self.parse_regex(start);
2240 } else {
2241 // Mode indicates we're expecting an operator → `/` is division or `//`
2242 // Examples: `$x / 2`, `$x // $y`, `10 / 3`
2243 self.advance();
2244 // Check for // or //= using byte-level operations for speed
2245 if self.peek_byte(0) == Some(b'/') {
2246 self.position += 1; // consume second / directly
2247 if self.peek_byte(0) == Some(b'=') {
2248 self.position += 1; // consume = directly
2249 let text = &self.input[start..self.position];
2250 self.mode = LexerMode::ExpectTerm;
2251 return Some(Token {
2252 token_type: TokenType::Operator(Arc::from(text)),
2253 text: Arc::from(text),
2254 start,
2255 end: self.position,
2256 });
2257 } else {
2258 // Use cached string for common "//" operator
2259 self.mode = LexerMode::ExpectTerm;
2260 return Some(Token {
2261 token_type: TokenType::Operator(Arc::from("//")),
2262 text: Arc::from("//"),
2263 start,
2264 end: self.position,
2265 });
2266 }
2267 } else if self.position < self.input_bytes.len()
2268 && self.input_bytes[self.position] == b'='
2269 {
2270 // /= division-assign operator
2271 self.position += 1; // consume =
2272 self.mode = LexerMode::ExpectTerm;
2273 return Some(Token {
2274 token_type: TokenType::Operator(Arc::from("/=")),
2275 text: Arc::from("/="),
2276 start,
2277 end: self.position,
2278 });
2279 } else {
2280 // Use cached string for common "/" division
2281 self.mode = LexerMode::ExpectTerm;
2282 return Some(Token {
2283 token_type: TokenType::Division,
2284 text: Arc::from("/"),
2285 start,
2286 end: self.position,
2287 });
2288 }
2289 }
2290 }
2291
2292 // Handle other operators - simplified
2293 match ch {
2294 '.' => {
2295 // Check if it's a decimal number like .5 -- but only when we
2296 // expect a term. In operator position `.5` is concatenation
2297 // of the bareword/number on the left with the number `5`.
2298 if self.mode != LexerMode::ExpectOperator
2299 && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
2300 {
2301 return self.parse_decimal_number(start);
2302 }
2303 self.advance();
2304 // Check for compound operators
2305 #[allow(clippy::collapsible_if)]
2306 if let Some(next) = self.current_char() {
2307 if is_compound_operator(ch, next) {
2308 self.advance();
2309
2310 // Check for three-character operators like **=, <<=, >>=
2311 if self.position < self.input.len() {
2312 let third = self.current_char();
2313 // Check for three-character operators
2314 if matches!(
2315 (ch, next, third),
2316 ('*', '*', Some('='))
2317 | ('<', '<', Some('='))
2318 | ('>', '>', Some('='))
2319 | ('&', '&', Some('='))
2320 | ('|', '|', Some('='))
2321 | ('/', '/', Some('='))
2322 ) {
2323 self.advance(); // consume the =
2324 } else if ch == '<' && next == '=' && third == Some('>') {
2325 self.advance(); // consume the >
2326 // Special case: <=> spaceship operator
2327 } else if ch == '.' && next == '.' && third == Some('.') {
2328 self.advance(); // consume the third .
2329 }
2330 }
2331 }
2332 }
2333 }
2334 '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
2335 | '\\' => {
2336 self.advance();
2337 // Check for compound operators
2338 #[allow(clippy::collapsible_if)]
2339 if let Some(next) = self.current_char() {
2340 if is_compound_operator(ch, next) {
2341 self.advance();
2342
2343 // Check for three-character operators like **=, <<=, >>=
2344 if self.position < self.input.len() {
2345 let third = self.current_char();
2346 // Check for three-character operators
2347 if matches!(
2348 (ch, next, third),
2349 ('*', '*', Some('='))
2350 | ('<', '<', Some('='))
2351 | ('>', '>', Some('='))
2352 | ('&', '&', Some('='))
2353 | ('|', '|', Some('='))
2354 | ('/', '/', Some('='))
2355 ) {
2356 self.advance(); // consume the =
2357 } else if ch == '<' && next == '=' && third == Some('>') {
2358 self.advance(); // consume the >
2359 // Special case: <=> spaceship operator
2360 }
2361 }
2362 }
2363 }
2364 }
2365 _ => return None,
2366 }
2367
2368 let text = &self.input[start..self.position];
2369 // Operator ends prototype window (e.g. `:` for attributes)
2370 self.after_sub = false;
2371 // Track whether this operator is '->' for method name disambiguation
2372 self.after_arrow = text == "->";
2373 // Any operator token ends the "just saw a variable" window; `{` after
2374 // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
2375 self.after_var_subscript = false;
2376 // Postfix ++ and -- complete a term expression, so next token is an operator
2377 // (e.g., "$x++ / 2" → / is division, not regex)
2378 if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
2379 // Postfix: stay in ExpectOperator
2380 } else {
2381 self.mode = LexerMode::ExpectTerm;
2382 }
2383
2384 Some(Token {
2385 token_type: TokenType::Operator(Arc::from(text)),
2386 text: Arc::from(text),
2387 start,
2388 end: self.position,
2389 })
2390 }
2391
2392 fn try_delimiter(&mut self) -> Option<Token> {
2393 let start = self.position;
2394 let ch = self.current_char()?;
2395
2396 // If we're expecting a delimiter for a quote operator, handle it specially
2397 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2398 // Accept any non-alphanumeric character as a delimiter
2399 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2400 self.advance();
2401 if let Some(ref mut info) = self.current_quote_op {
2402 info.delimiter = ch;
2403 }
2404 // Now parse the quote operator content
2405 return self.parse_quote_operator(ch);
2406 }
2407 }
2408
2409 match ch {
2410 '(' => {
2411 // Check if this is a quote operator delimiter
2412 if matches!(self.mode, LexerMode::ExpectDelimiter)
2413 && self.current_quote_op.is_some()
2414 {
2415 self.advance();
2416 if let Some(ref mut info) = self.current_quote_op {
2417 info.delimiter = ch;
2418 }
2419 return self.parse_quote_operator(ch);
2420 }
2421
2422 self.advance();
2423 if self.after_sub {
2424 // Promote after_sub to in_prototype now that we see '('
2425 self.in_prototype = true;
2426 self.after_sub = false;
2427 self.prototype_depth = 1;
2428 } else if self.in_prototype {
2429 self.prototype_depth += 1;
2430 }
2431 self.paren_depth += 1;
2432 self.after_var_subscript = false;
2433 self.mode = LexerMode::ExpectTerm;
2434 Some(Token {
2435 token_type: TokenType::LeftParen,
2436 text: Arc::from("("),
2437 start,
2438 end: self.position,
2439 })
2440 }
2441 ')' => {
2442 self.advance();
2443 if self.in_prototype && self.prototype_depth > 0 {
2444 self.prototype_depth -= 1;
2445 if self.prototype_depth == 0 {
2446 self.in_prototype = false;
2447 }
2448 }
2449 self.after_arrow = false;
2450 self.paren_depth = self.paren_depth.saturating_sub(1);
2451 // A closing paren ends any var-subscript context: `if ($var)` should
2452 // NOT leave after_var_subscript set, otherwise the following `{` would
2453 // incorrectly increment hash_brace_depth and suppress regex operators
2454 // inside the block body (issue #2844).
2455 self.after_var_subscript = false;
2456 self.mode = LexerMode::ExpectOperator;
2457 Some(Token {
2458 token_type: TokenType::RightParen,
2459 text: Arc::from(")"),
2460 start,
2461 end: self.position,
2462 })
2463 }
2464 ';' => {
2465 self.advance();
2466 // Semicolon ends prototype window (forward declaration)
2467 self.after_sub = false;
2468 // Semicolon is a statement boundary — any pending method-call chain is over.
2469 self.after_arrow = false;
2470 self.after_var_subscript = false;
2471 self.mode = LexerMode::ExpectTerm;
2472 Some(Token {
2473 token_type: TokenType::Semicolon,
2474 text: Arc::from(";"),
2475 start,
2476 end: self.position,
2477 })
2478 }
2479 ',' => {
2480 self.advance();
2481 self.after_var_subscript = false;
2482 self.mode = LexerMode::ExpectTerm;
2483 Some(Token {
2484 token_type: TokenType::Comma,
2485 text: Arc::from(","),
2486 start,
2487 end: self.position,
2488 })
2489 }
2490 '[' => {
2491 self.advance();
2492 self.after_var_subscript = false;
2493 self.mode = LexerMode::ExpectTerm;
2494 Some(Token {
2495 token_type: TokenType::LeftBracket,
2496 text: Arc::from("["),
2497 start,
2498 end: self.position,
2499 })
2500 }
2501 ']' => {
2502 self.advance();
2503 // A closing `]` from an array subscript leaves us in a state where
2504 // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
2505 // Set after_var_subscript so the `{` handler recognises it as such.
2506 // This mirrors the `}` handler's behavior when closing a hash subscript.
2507 self.after_var_subscript = true;
2508 self.mode = LexerMode::ExpectOperator;
2509 Some(Token {
2510 token_type: TokenType::RightBracket,
2511 text: Arc::from("]"),
2512 start,
2513 end: self.position,
2514 })
2515 }
2516 '{' => {
2517 self.advance();
2518 // Opening brace ends prototype window — no prototype follows
2519 self.after_sub = false;
2520 // `{` is a hash/slice subscript opener only when it immediately follows
2521 // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
2522 // This is narrower than the old `mode == ExpectOperator` check, which
2523 // incorrectly incremented depth for block-opening braces after `sub foo`,
2524 // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
2525 // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
2526 if self.after_var_subscript {
2527 self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
2528 }
2529 self.after_var_subscript = false;
2530 self.mode = LexerMode::ExpectTerm;
2531 Some(Token {
2532 token_type: TokenType::LeftBrace,
2533 text: Arc::from("{"),
2534 start,
2535 end: self.position,
2536 })
2537 }
2538 '}' => {
2539 self.advance();
2540 self.after_arrow = false;
2541 // Decrement hash subscript brace depth only if we were inside one.
2542 // If depth > 0, this closes a hash subscript; enable chained subscripts
2543 // like $h{a}{b} by setting after_var_subscript so the next `{` is
2544 // recognized as another subscript opener.
2545 if self.hash_brace_depth > 0 {
2546 self.hash_brace_depth -= 1;
2547 // The subscript value is now the "variable" for a chained subscript.
2548 self.after_var_subscript = true;
2549 } else {
2550 // Block-close `}` — no subscript follows
2551 self.after_var_subscript = false;
2552 }
2553 self.mode = LexerMode::ExpectOperator;
2554 Some(Token {
2555 token_type: TokenType::RightBrace,
2556 text: Arc::from("}"),
2557 start,
2558 end: self.position,
2559 })
2560 }
2561 '#' => {
2562 // Only treat as delimiter in ExpectDelimiter mode
2563 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2564 self.advance();
2565 // Reset mode after consuming delimiter
2566 self.mode = LexerMode::ExpectTerm;
2567 Some(Token {
2568 token_type: TokenType::Operator(Arc::from("#")),
2569 text: Arc::from("#"),
2570 start,
2571 end: self.position,
2572 })
2573 } else {
2574 None
2575 }
2576 }
2577 _ => None,
2578 }
2579 }
2580
    /// Lex a double-quoted string starting at `start` (the opening `"`).
    ///
    /// When interpolation is enabled (`config.parse_interpolation`), `$`-forms
    /// are split into `StringPart`s: `${...}` expressions, `$name` variables,
    /// and `$name->...` method-call / `$name[...]` / `$name{...}` tails.
    /// Returns `StringLiteral` when no parts were collected, otherwise
    /// `InterpolatedString(parts)`; an unterminated string yields an `Error`
    /// token via `unterminated_string_error`.
    ///
    /// NOTE(review): a plain string like `"abc"` ends with one `Literal` part
    /// and is therefore emitted as `InterpolatedString`, not `StringLiteral`
    /// (which only occurs for `""`) — confirm downstream consumers expect this.
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                '"' => {
                    // Closing quote: flush any pending literal and emit.
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Keep escape sequences verbatim (backslash + next char);
                    // decoding is left to later stages.
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    let part_start = self.position;
                    self.advance();
                    match self.current_char() {
                        Some('{') => {
                            // ${ expr } — capture the whole balanced segment.
                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                            parts.push(StringPart::Expression(Arc::from(
                                &self.input[part_start..self.position],
                            )));
                        }
                        Some(ch) if is_perl_identifier_start(ch) => {
                            let var_start = self.position;

                            // Fast path for ASCII identifier continuation
                            while self.position < self.input_bytes.len() {
                                let byte = self.input_bytes[self.position];
                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                    self.position += 1;
                                } else if byte >= 128 {
                                    // Only use UTF-8 parsing for non-ASCII
                                    if let Some(ch) = self.current_char() {
                                        if is_perl_identifier_continue(ch) {
                                            self.advance();
                                        } else {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                } else {
                                    break;
                                }
                            }

                            if self.position > var_start {
                                // The part text includes the leading `$`.
                                let var_name = &self.input[part_start..self.position];
                                parts.push(StringPart::Variable(Arc::from(var_name)));

                                if self.matches_bytes(b"->") {
                                    // `$var->...` tail: subscript, method call,
                                    // or bare arrow — captured as MethodCall.
                                    let tail_start = self.position;
                                    self.advance();
                                    self.advance();

                                    match self.current_char() {
                                        Some('[') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('[', ']', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('{') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('{', '}', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some('(') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('(', ')', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        Some(ch) if is_perl_identifier_start(ch) => {
                                            // Method name, optionally followed by
                                            // an argument list in parens.
                                            while self.position < self.input_bytes.len() {
                                                let byte = self.input_bytes[self.position];
                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                                    self.position += 1;
                                                } else if byte >= 128 {
                                                    if let Some(ch) = self.current_char() {
                                                        if is_perl_identifier_continue(ch) {
                                                            self.advance();
                                                        } else {
                                                            break;
                                                        }
                                                    } else {
                                                        break;
                                                    }
                                                } else {
                                                    break;
                                                }
                                            }
                                            if self.current_char() == Some('(') {
                                                let _ = self.consume_balanced_segment_in_string(
                                                    '(', ')', '"',
                                                );
                                            }
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        _ => {
                                            // Bare `->` with nothing usable after:
                                            // still recorded so source round-trips.
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                    }
                                } else if self.current_char() == Some('[') {
                                    // `$var[...]` array element/slice.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
                                    parts.push(StringPart::ArraySlice(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                } else if self.current_char() == Some('{') {
                                    // `$var{...}` hash element.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                                    parts.push(StringPart::Expression(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                }
                            }
                        }
                        // NOTE(review): a `$` not followed by `{` or an identifier
                        // start is consumed but pushed to no part — the token's
                        // `text` keeps the full source, but the parts list drops
                        // the literal `$`. Confirm whether parts are expected to
                        // reconstruct the string.
                        _ => {}
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        // EOF before closing quote.
        Some(self.unterminated_string_error(start))
    }
2763
2764 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2765 self.advance(); // Skip opening quote
2766
2767 let mut last_pos = self.position;
2768
2769 while let Some(ch) = self.current_char() {
2770 match ch {
2771 '\'' => {
2772 self.advance();
2773 let text = &self.input[start..self.position];
2774 self.mode = LexerMode::ExpectOperator;
2775
2776 return Some(Token {
2777 token_type: TokenType::StringLiteral,
2778 text: Arc::from(text),
2779 start,
2780 end: self.position,
2781 });
2782 }
2783 '\\' => {
2784 self.advance();
2785 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2786 self.advance();
2787 }
2788 }
2789 _ => self.advance(),
2790 }
2791
2792 // Safety check: ensure we're making progress
2793 if self.position == last_pos {
2794 break;
2795 }
2796 last_pos = self.position;
2797 }
2798
2799 Some(self.unterminated_string_error(start))
2800 }
2801
2802 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2803 self.advance(); // Skip opening backtick
2804
2805 let mut last_pos = self.position;
2806
2807 while let Some(ch) = self.current_char() {
2808 match ch {
2809 '`' => {
2810 self.advance();
2811 let text = &self.input[start..self.position];
2812 self.mode = LexerMode::ExpectOperator;
2813
2814 return Some(Token {
2815 token_type: TokenType::QuoteCommand,
2816 text: Arc::from(text),
2817 start,
2818 end: self.position,
2819 });
2820 }
2821 '\\' => {
2822 self.advance();
2823 if self.current_char().is_some() {
2824 self.advance();
2825 }
2826 }
2827 _ => self.advance(),
2828 }
2829
2830 // Safety check: ensure we're making progress
2831 if self.position == last_pos {
2832 break;
2833 }
2834 last_pos = self.position;
2835 }
2836
2837 Some(self.unterminated_string_error(start))
2838 }
2839
    /// Placeholder for dedicated `q`-string parsing.
    ///
    /// NOTE(review): always returns `None`; q/qq/qw/qx lexing appears to go
    /// through the quote-operator path (`parse_quote_operator`) instead —
    /// confirm this stub is intentionally inert before removing it.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
2844
2845 #[inline]
2846 fn unterminated_string_error(&mut self, start: usize) -> Token {
2847 // Consume to EOF so the caller receives a single terminal error token.
2848 let end = self.input.len();
2849 self.position = end;
2850
2851 Token {
2852 token_type: TokenType::Error(Arc::from("unterminated string")),
2853 text: Arc::from(&self.input[start..end]),
2854 start,
2855 end,
2856 }
2857 }
2858
2859 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
2860 // We've already consumed 's'
2861 let delimiter = self.current_char()?;
2862 self.advance(); // Skip delimiter
2863 self.parse_substitution_with_delimiter(start, delimiter)
2864 }
2865
2866 fn parse_substitution_with_delimiter(
2867 &mut self,
2868 start: usize,
2869 delimiter: char,
2870 ) -> Option<Token> {
2871 self.read_delimited_body(delimiter);
2872
2873 let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
2874 if pattern_is_paired {
2875 while self.current_char().is_some_and(char::is_whitespace) {
2876 self.advance();
2877 }
2878
2879 if let Some(repl_delim) = self.current_char()
2880 && Self::is_quote_delim(repl_delim)
2881 {
2882 self.advance();
2883 self.read_delimited_body(repl_delim);
2884 }
2885 } else {
2886 self.read_delimited_body(delimiter);
2887 }
2888
2889 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2890 while let Some(ch) = self.current_char() {
2891 if ch.is_ascii_alphanumeric() {
2892 self.advance();
2893 } else {
2894 break;
2895 }
2896 }
2897
2898 let text = &self.input[start..self.position];
2899 self.mode = LexerMode::ExpectOperator;
2900
2901 Some(Token {
2902 token_type: TokenType::Substitution,
2903 text: Arc::from(text),
2904 start,
2905 end: self.position,
2906 })
2907 }
2908
2909 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
2910 // We've already consumed 'tr' or 'y'
2911 while self.current_char().is_some_and(char::is_whitespace) {
2912 self.advance();
2913 }
2914
2915 let delimiter = self.current_char()?;
2916 self.advance(); // Skip delimiter
2917 self.parse_transliteration_with_delimiter(start, delimiter)
2918 }
2919
2920 fn parse_transliteration_with_delimiter(
2921 &mut self,
2922 start: usize,
2923 delimiter: char,
2924 ) -> Option<Token> {
2925 self.read_delimited_body(delimiter);
2926
2927 let search_is_paired = quote_handler::paired_close(delimiter).is_some();
2928 if search_is_paired {
2929 while self.current_char().is_some_and(char::is_whitespace) {
2930 self.advance();
2931 }
2932
2933 if let Some(repl_delim) = self.current_char()
2934 && Self::is_quote_delim(repl_delim)
2935 {
2936 self.advance();
2937 self.read_delimited_body(repl_delim);
2938 }
2939 } else {
2940 self.read_delimited_body(delimiter);
2941 }
2942
2943 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2944 while let Some(ch) = self.current_char() {
2945 if ch.is_ascii_alphanumeric() {
2946 self.advance();
2947 } else {
2948 break;
2949 }
2950 }
2951
2952 let text = &self.input[start..self.position];
2953 self.mode = LexerMode::ExpectOperator;
2954
2955 Some(Token {
2956 token_type: TokenType::Transliteration,
2957 text: Arc::from(text),
2958 start,
2959 end: self.position,
2960 })
2961 }
2962
2963 /// Read content between delimiters.
2964 ///
2965 /// Returns `(body, closed)` where `closed` is `true` if the closing
2966 /// delimiter was found before EOF, and `false` if EOF was reached first.
2967 fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
2968 let paired = quote_handler::paired_close(delim);
2969 let close = paired.unwrap_or(delim);
2970 let mut body = String::new();
2971 let mut depth = i32::from(paired.is_some());
2972
2973 while let Some(ch) = self.current_char() {
2974 if ch == '\\' {
2975 body.push(ch);
2976 self.advance();
2977 if let Some(next) = self.current_char() {
2978 body.push(next);
2979 self.advance();
2980 }
2981 continue;
2982 }
2983
2984 if paired.is_some() && ch == delim {
2985 body.push(ch);
2986 self.advance();
2987 depth += 1;
2988 continue;
2989 }
2990
2991 if ch == close {
2992 if paired.is_some() {
2993 depth -= 1;
2994 if depth == 0 {
2995 self.advance();
2996 return (body, true);
2997 }
2998 body.push(ch);
2999 self.advance();
3000 } else {
3001 self.advance();
3002 return (body, true);
3003 }
3004 continue;
3005 }
3006
3007 body.push(ch);
3008 self.advance();
3009 }
3010
3011 // EOF reached without finding the closing delimiter
3012 (body, false)
3013 }
3014
3015 /// Parse a quote operator after we've seen the delimiter
3016 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3017 let info = self.current_quote_op.as_ref()?;
3018 let start = info.start_pos;
3019 let operator = info.operator.clone();
3020
3021 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3022 // below) does not leave a stale reference behind. The post-match cleanup at the
3023 // bottom of this function would otherwise be skipped for those operators.
3024 self.current_quote_op = None;
3025
3026 // Parse based on operator type; track whether all delimiters were closed.
3027 let closed = match operator.as_str() {
3028 "s" => {
3029 return self.parse_substitution_with_delimiter(start, delimiter);
3030 }
3031 "tr" | "y" => {
3032 return self.parse_transliteration_with_delimiter(start, delimiter);
3033 }
3034 "qr" => {
3035 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3036 self.parse_regex_modifiers("e_handler::QR_SPEC);
3037 body_closed
3038 }
3039 "m" => {
3040 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3041 self.parse_regex_modifiers("e_handler::M_SPEC);
3042 body_closed
3043 }
3044 _ => {
3045 // q, qq, qw, qx - no modifiers
3046 let (_body, body_closed) = self.read_delimited_body(delimiter);
3047 body_closed
3048 }
3049 };
3050
3051 let text = &self.input[start..self.position];
3052
3053 self.mode = LexerMode::ExpectOperator;
3054
3055 if !closed {
3056 // EOF reached before finding the closing delimiter — emit an error
3057 // token so the parser's recovery mechanism records a diagnostic.
3058 return Some(Token {
3059 token_type: TokenType::Error(Arc::from(format!(
3060 "unclosed {} delimiter '{}'",
3061 operator, delimiter
3062 ))),
3063 text: Arc::from(text),
3064 start,
3065 end: self.position,
3066 });
3067 }
3068
3069 let token_type = quote_handler::get_quote_token_type(&operator);
3070 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3071 }
3072
3073 /// Parse regex modifiers according to the given spec
3074 ///
3075 /// This function includes ALL characters that could be intended as modifiers,
3076 /// including invalid ones. This allows the parser to properly reject invalid
3077 /// modifiers with a clear error message, rather than leaving them as separate
3078 /// tokens that could be confusingly parsed.
3079 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3080 // Consume all alphanumeric characters that could be intended as modifiers
3081 // The parser will validate and reject invalid ones
3082 while let Some(ch) = self.current_char() {
3083 if ch.is_ascii_alphanumeric() {
3084 self.advance();
3085 } else {
3086 break;
3087 }
3088 }
3089 // Note: We no longer validate here - the parser will validate and provide
3090 // clear error messages for invalid modifiers (MUT_005 fix)
3091 }
3092
3093 /// Parse a regex literal starting with `/`
3094 ///
3095 /// **Budget Protection (Issue #422)**:
3096 /// - Budget guards prevent runaway scanning on pathological input
3097 /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
3098 /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
3099 /// - Graceful degradation: emit UnknownRest token if budget exceeded
3100 ///
3101 /// **Performance**:
3102 /// - Single-pass scanning with escape handling
3103 /// - Budget check per iteration (amortized O(1) via inline fast path)
3104 /// - Typical regex: <10μs, Large regex (64KB): ~1ms
3105 fn parse_regex(&mut self, start: usize) -> Option<Token> {
3106 self.advance(); // Skip opening /
3107
3108 let mut regex_parse_steps: usize = 0;
3109 let mut in_character_class = false;
3110
3111 while let Some(ch) = self.current_char() {
3112 regex_parse_steps += 1;
3113 if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
3114 #[cfg(debug_assertions)]
3115 {
3116 let text = &self.input[start..self.position];
3117 let preview = truncate_preview(text, 50);
3118 tracing::debug!(
3119 limit = MAX_REGEX_PARSE_STEPS,
3120 pattern_preview = %preview,
3121 "Regex parse step budget exceeded"
3122 );
3123 }
3124 self.position = self.input.len();
3125 return Some(Token {
3126 token_type: TokenType::UnknownRest,
3127 text: empty_arc(),
3128 start,
3129 end: self.position,
3130 });
3131 }
3132
3133 // Budget guard: prevent timeout on pathological input (Issue #422)
3134 // If exceeded, returns UnknownRest token for graceful degradation
3135 if let Some(token) = self.budget_guard(start, 0) {
3136 return Some(token);
3137 }
3138
3139 match ch {
3140 '/' if !in_character_class => {
3141 self.advance();
3142 // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
3143 while let Some(ch) = self.current_char() {
3144 if ch.is_ascii_alphanumeric() {
3145 self.advance();
3146 } else {
3147 break;
3148 }
3149 }
3150
3151 let text = &self.input[start..self.position];
3152 self.mode = LexerMode::ExpectOperator;
3153
3154 return Some(Token {
3155 token_type: TokenType::RegexMatch,
3156 text: Arc::from(text),
3157 start,
3158 end: self.position,
3159 });
3160 }
3161 '\\' => {
3162 // Handle escape sequences: consume backslash + next char
3163 self.advance();
3164 if self.current_char().is_some() {
3165 self.advance();
3166 }
3167 }
3168 '[' => {
3169 in_character_class = true;
3170 self.advance();
3171 }
3172 ']' if in_character_class => {
3173 in_character_class = false;
3174 self.advance();
3175 }
3176 _ => self.advance(),
3177 }
3178 }
3179
3180 // Unterminated regex - EOF reached before closing /
3181 // Parser will emit diagnostic for unterminated literal
3182 None
3183 }
3184}
3185
// Process-wide shared empty `Arc<str>`, so empty tokens cost only a
// refcount bump instead of a fresh allocation each time.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return a clone of the shared empty `Arc<str>` (O(1), no allocation
/// after first use).
#[inline(always)]
fn empty_arc() -> Arc<str> {
    EMPTY_ARC.get_or_init(|| "".into()).clone()
}
3193
/// Shorten `text` to at most `max_chars` characters for log previews,
/// appending `...` when anything was cut. Slicing is done at a char
/// boundary, so multi-byte UTF-8 text is never split mid-character.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    text.char_indices()
        .nth(max_chars)
        .map_or_else(|| text.to_string(), |(cut, _)| format!("{}...", &text[..cut]))
}
3200
3201#[inline(always)]
3202fn is_keyword_fast(word: &str) -> bool {
3203 // Fast length-based rejection for most cases.
3204 // Lexer keywords are currently bounded to 1..=9 characters.
3205 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
3206}
3207
3208#[inline]
3209fn is_builtin_function(word: &str) -> bool {
3210 BARE_TERM_BUILTINS.binary_search(&word).is_ok()
3211}
3212
/// True when `word` (as raw bytes) is exactly one of Perl's quote-like
/// operator names: `m`, `q`, `qq`, `qw`, `qx`, `qr`.
#[inline(always)]
fn is_quote_op_word_prefix(word: &[u8]) -> bool {
    const QUOTE_OPS: [&[u8]; 6] = [b"m", b"q", b"qq", b"qw", b"qx", b"qr"];
    QUOTE_OPS.contains(&word)
}
3217
/// Builtin function names that may be followed directly by a bare term.
///
/// INVARIANT: this list must remain sorted in ascending byte order —
/// `is_builtin_function` performs a `binary_search` over it, which is
/// only correct on sorted input.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex", "int",
    "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print", "push",
    "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt", "substr",
    "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
3224
/// Fast lookup table of every possible *second* byte of a two-character
/// compound operator, used as a cheap pre-filter before the full pair match.
///
/// Fix: `/` was missing, so the pre-filter rejected `('/', '/')` before the
/// `(b'/', b'/')` match arm could ever fire — the defined-or operator `//`
/// was never recognized as compound.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*:/";

/// Return true when `first` followed by `second` forms a two-character
/// compound operator (`+=`, `==`, `=~`, `&&`, `<<`, `->`, `//`, `::`, ...).
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    // Every compound operator is ASCII, so any pair containing a non-ASCII
    // character can be rejected immediately. (The previous non-ASCII fallback
    // was dead code: all of its patterns were ASCII and thus unreachable in
    // that branch.)
    if !(first.is_ascii() && second.is_ascii()) {
        return false;
    }
    let first_byte = first as u8;
    let second_byte = second as u8;

    // Cheap pre-filter: reject pairs whose second byte can never end a
    // compound operator.
    if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
        return false;
    }

    match (first_byte, second_byte) {
        // Assignment operators: += -= *= /= %= &= |= ^= .=
        (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,

        // Comparison operators: <= >= == !=
        (b'<' | b'>' | b'=' | b'!', b'=') => true,

        // Pattern-bind operators: =~ !~
        (b'=' | b'!', b'~') => true,

        // Increment/decrement: ++ --
        (b'+', b'+') | (b'-', b'-') => true,

        // Logical operators: && ||
        (b'&', b'&') | (b'|', b'|') => true,

        // Shift operators: << >>
        (b'<', b'<') | (b'>', b'>') => true,

        // Other compound operators: ** // -> => .. ~~ ::
        (b'*', b'*')
        | (b'/', b'/')
        | (b'-' | b'=', b'>')
        | (b'.', b'.')
        | (b'~', b'~')
        | (b':', b':') => true,

        _ => false,
    }
}
3289
3290// Checkpoint support for incremental parsing
3291
3292mod checkpoint_impl;
3293
3294#[cfg(test)]
3295mod test_format_debug;
3296#[cfg(test)]
3297mod tests;