perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![allow(
117 // Core allows for lexer code
118 clippy::too_many_lines,
119 clippy::module_name_repetitions,
120 clippy::cast_possible_truncation,
121 clippy::cast_sign_loss,
122 clippy::cast_possible_wrap,
123 clippy::cast_precision_loss,
124 clippy::must_use_candidate,
125 clippy::missing_errors_doc,
126 clippy::missing_panics_doc,
127
128 // Lexer-specific patterns that are fine
129 clippy::match_same_arms,
130 clippy::redundant_else,
131 clippy::unnecessary_wraps,
132 clippy::unused_self,
133 clippy::items_after_statements,
134 clippy::struct_excessive_bools,
135 clippy::uninlined_format_args
136)]
137
138use std::sync::{Arc, OnceLock};
139
140pub mod api;
141pub mod builtins;
142pub mod checkpoint;
143pub mod config;
144pub mod error;
145mod heredoc;
146pub mod keywords;
147mod lexer;
148pub mod limits;
149pub mod mode;
150mod quote_handler;
151pub mod token;
152pub mod tokenizer;
153mod unicode;
154
155pub use api::*;
156pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
157pub use config::LexerConfig;
158pub use error::{LexerError, Result};
159pub use lexer::PerlLexer;
160pub use limits::MAX_REGEX_PARSE_STEPS;
161pub use mode::LexerMode;
162pub use perl_position_tracking::Position;
163pub use token::{StringPart, Token, TokenType};
164
165use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
166
167use crate::heredoc::HeredocSpec;
168use crate::limits::{
169 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
170};
171
172impl<'a> PerlLexer<'a> {
173 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
174 pub fn with_body_tokens(input: &'a str) -> Self {
175 let mut lexer = Self::new(input);
176 lexer.emit_heredoc_body_tokens = true;
177 lexer
178 }
179
    /// Set the lexer mode (for resetting state at statement boundaries)
    ///
    /// Intended for external drivers (e.g. a parser) that know more about the
    /// grammatical context than the lexer does and want to force the next
    /// token to be read as a term or operator.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
184
    /// Advance the lexer and return the next token.
    ///
    /// Returns `None` only after an `EOF` token has already been emitted.
    /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs: whenever a heredoc
        // body (or aborted quote-operator state) is consumed without producing
        // a token, we `continue` back here instead of calling ourselves.
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow
                // (body_start == 0 is the sentinel for "body not started yet")
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // `<<~LABEL` form: allow any leading spaces/tabs before the label
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any);
                            // stacked heredocs on one source line are read FIFO.
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            // Currently always returns Some(()); the `?` keeps the call uniform.
            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first.
            // NOTE: dispatch order matters — each try_* either consumes and
            // returns a token or leaves `position` untouched for the next one.
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_vstring() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Unicode path without intermediate heap allocation
                let mut buf = [0_u8; 4];
                Arc::from(ch.encode_utf8(&mut buf))
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
449
450 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
451 ///
452 /// **Purpose**: Protect against pathological input that could cause:
453 /// - Infinite loops in regex/heredoc parsing
454 /// - Excessive memory consumption
455 /// - LSP server hangs
456 ///
457 /// **Limits**:
458 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
459 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
460 ///
461 /// **Graceful Degradation**:
462 /// - Budget exceeded → emit `UnknownRest` token
463 /// - Jump to EOF to prevent further parsing of problematic region
464 /// - LSP client can emit soft diagnostic about truncation
465 /// - All previously parsed symbols remain valid
466 ///
467 /// **Performance**:
468 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
469 /// - Slow path: Only triggered on pathological input
470 /// - Amortized cost: O(1) per token
471 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
472 #[inline(always)]
473 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
474 // Fast path: most calls won't hit limits
475 let bytes_consumed = self.position - start;
476 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
477 return None;
478 }
479
480 // Slow path: budget exceeded - graceful degradation
481 #[cfg(debug_assertions)]
482 {
483 tracing::debug!(
484 bytes_consumed,
485 depth,
486 position = self.position,
487 "Lexer budget exceeded"
488 );
489 }
490
491 self.position = self.input.len();
492 Some(Token {
493 token_type: TokenType::UnknownRest,
494 text: Arc::from(""),
495 start,
496 end: self.position,
497 })
498 }
499
    /// Peek at the next token without consuming it.
    ///
    /// Saves and restores the full lexer state so the next call to
    /// [`next_token`](Self::next_token) returns the same token.
    ///
    /// NOTE(review): this saves each mutable field by hand rather than going
    /// through the checkpoint machinery; any new piece of lexer state must be
    /// added to BOTH the save and restore lists below or peeking will leak
    /// state into the subsequent `next_token` call.
    pub fn peek_token(&mut self) -> Option<Token> {
        // Snapshot every field that next_token can mutate.
        let saved_pos = self.position;
        let saved_mode = self.mode;
        let saved_delimiter_stack = self.delimiter_stack.clone();
        let saved_prototype = self.in_prototype;
        let saved_depth = self.prototype_depth;
        let saved_after_sub = self.after_sub;
        let saved_after_arrow = self.after_arrow;
        let saved_hash_brace_depth = self.hash_brace_depth;
        let saved_after_var_subscript = self.after_var_subscript;
        let saved_paren_depth = self.paren_depth;
        let saved_current_pos = self.current_pos;
        let saved_after_newline = self.after_newline;
        let saved_pending_heredocs = self.pending_heredocs.clone();
        let saved_line_start_offset = self.line_start_offset;
        let saved_current_quote_op = self.current_quote_op.clone();
        let saved_eof_emitted = self.eof_emitted;
        let saved_start_time = self.start_time;

        let token = self.next_token();

        // Roll everything back so the peek is side-effect free.
        self.position = saved_pos;
        self.mode = saved_mode;
        self.delimiter_stack = saved_delimiter_stack;
        self.in_prototype = saved_prototype;
        self.prototype_depth = saved_depth;
        self.after_sub = saved_after_sub;
        self.after_arrow = saved_after_arrow;
        self.hash_brace_depth = saved_hash_brace_depth;
        self.after_var_subscript = saved_after_var_subscript;
        self.paren_depth = saved_paren_depth;
        self.current_pos = saved_current_pos;
        self.after_newline = saved_after_newline;
        self.pending_heredocs = saved_pending_heredocs;
        self.line_start_offset = saved_line_start_offset;
        self.current_quote_op = saved_current_quote_op;
        self.eof_emitted = saved_eof_emitted;
        self.start_time = saved_start_time;

        token
    }
545
546 /// Consume all remaining tokens and return them as a vector.
547 ///
548 /// The returned vector always ends with an `EOF` token.
549 pub fn collect_tokens(&mut self) -> Vec<Token> {
550 let mut tokens = Vec::new();
551 while let Some(token) = self.next_token() {
552 if token.token_type == TokenType::EOF {
553 tokens.push(token);
554 break;
555 }
556 tokens.push(token);
557 }
558 tokens
559 }
560
    /// Reset the lexer to the beginning of the input.
    ///
    /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
    /// so the lexer can re-tokenize the same source from scratch.
    pub fn reset(&mut self) {
        // Byte offset and term/operator expectation back to initial values.
        self.position = 0;
        self.mode = LexerMode::ExpectTerm;
        // Context-tracking state used for disambiguation.
        self.delimiter_stack.clear();
        self.in_prototype = false;
        self.prototype_depth = 0;
        self.after_sub = false;
        self.after_arrow = false;
        self.hash_brace_depth = 0;
        self.after_var_subscript = false;
        self.paren_depth = 0;
        // Line/position bookkeeping: we begin at a (virtual) line start.
        self.current_pos = Position::start();
        self.after_newline = true;
        self.pending_heredocs.clear();
        self.line_start_offset = 0;
        self.current_quote_op = None;
        self.eof_emitted = false;
        // Restart the clock used for heredoc timeout protection.
        self.start_time = std::time::Instant::now();
    }
584
    /// Switch the lexer into format-body parsing mode.
    ///
    /// In this mode the lexer consumes input verbatim until it encounters a
    /// line containing only `.` (the Perl format terminator).
    /// `next_token` checks for `LexerMode::InFormatBody` before any other
    /// processing and delegates to `parse_format_body`.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
592
593 // Internal helper methods
594
595 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
596 #[inline(always)]
597 fn byte_at(bytes: &[u8], index: usize) -> u8 {
598 debug_assert!(index < bytes.len());
599 match bytes.get(index) {
600 Some(&byte) => byte,
601 None => 0,
602 }
603 }
604
605 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
606 #[inline(always)]
607 fn current_char(&self) -> Option<char> {
608 if self.position < self.input_bytes.len() {
609 // For ASCII, direct access is safe
610 let byte = Self::byte_at(self.input_bytes, self.position);
611 if byte < 128 {
612 Some(byte as char)
613 } else {
614 // For non-ASCII, fall back to proper UTF-8 parsing
615 self.input.get(self.position..).and_then(|s| s.chars().next())
616 }
617 } else {
618 None
619 }
620 }
621
622 #[inline(always)]
623 fn peek_char(&self, offset: usize) -> Option<char> {
624 if offset > self.config.max_lookahead {
625 return None;
626 }
627
628 let pos = self.position.checked_add(offset)?;
629 if pos < self.input_bytes.len() {
630 // For ASCII, direct access is safe
631 let byte = Self::byte_at(self.input_bytes, pos);
632 if byte < 128 {
633 Some(byte as char)
634 } else {
635 // For non-ASCII, use chars iterator
636 self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
637 }
638 } else {
639 None
640 }
641 }
642
643 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
644 #[inline(always)]
645 fn advance(&mut self) {
646 if self.position < self.input_bytes.len() {
647 let byte = Self::byte_at(self.input_bytes, self.position);
648 if byte < 128 {
649 // ASCII fast path
650 self.position += 1;
651 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
652 {
653 self.position += ch.len_utf8();
654 }
655 }
656 }
657
    /// General-purpose balanced-segment consumer (no quote-boundary recovery).
    ///
    /// On entry the current char must equal `open`; consumes through the
    /// matching `close` (tracking nesting depth and skipping `\`-escaped
    /// characters) and returns the byte position just past the closing
    /// delimiter, or `None` if the segment never closes before EOF. On the
    /// `None` path the consumed input is NOT rewound.
    ///
    /// NOTE(review): the `open` arm is matched before the `close` arm, so
    /// calling this with `open == close` can only ever increment `depth` —
    /// confirm callers pass distinct bracket pairs only.
    ///
    /// For use inside double-quoted string interpolation where the outer `"` must
    /// act as a recovery boundary, use [`consume_balanced_segment_in_string`] instead.
    #[allow(dead_code)]
    #[inline]
    fn consume_balanced_segment(&mut self, open: char, close: char) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // EOF reached before the segment closed.
        None
    }
696
    /// Balanced-segment consumer for use inside a quoted string.
    ///
    /// Like `consume_balanced_segment`, but treats an (unescaped) `terminator`
    /// character — the string's closing quote — as a recovery boundary:
    /// encountering it aborts with `None` so the outer string parser can still
    /// terminate its token cleanly. An escaped terminator (`\"` etc.) is
    /// consumed by the `'\\'` arm and does not abort.
    ///
    /// NOTE(review): on the `None` paths (terminator hit or EOF) the input
    /// consumed so far is not rewound; callers appear to rely on the outer
    /// string scanner re-finding the quote — confirm.
    #[inline]
    fn consume_balanced_segment_in_string(
        &mut self,
        open: char,
        close: char,
        terminator: char,
    ) -> Option<usize> {
        if self.current_char() != Some(open) {
            return None;
        }

        let mut depth = 1usize;
        self.advance();
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    // Skip the backslash and the character it escapes.
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                c if c == terminator => {
                    // Local recovery for interpolation tails in quoted strings:
                    // stop at the closing quote so the outer string parser can
                    // still terminate this token cleanly.
                    return None;
                }
                c if c == open => {
                    depth += 1;
                    self.advance();
                }
                c if c == close => {
                    self.advance();
                    depth -= 1;
                    if depth == 0 {
                        return Some(self.position);
                    }
                }
                _ => self.advance(),
            }
        }

        // EOF reached before the segment closed.
        None
    }
741
742 /// Fast byte-level check for ASCII characters
743 #[inline]
744 fn peek_byte(&self, offset: usize) -> Option<u8> {
745 if offset > self.config.max_lookahead {
746 return None;
747 }
748
749 let pos = self.position.checked_add(offset)?;
750 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
751 }
752
753 /// Check if the next bytes match a pattern (ASCII only)
754 #[inline]
755 fn matches_bytes(&self, pattern: &[u8]) -> bool {
756 let Some(end_offset) = pattern.len().checked_sub(1) else {
757 return true;
758 };
759
760 if end_offset > self.config.max_lookahead {
761 return false;
762 }
763
764 let Some(end) = self.position.checked_add(pattern.len()) else {
765 return false;
766 };
767
768 if end <= self.input_bytes.len() {
769 &self.input_bytes[self.position..end] == pattern
770 } else {
771 false
772 }
773 }
774
    /// Skip whitespace, `#` line comments, and POD blocks before the next token.
    ///
    /// Side effects: maintains `after_newline`, assigns `body_start` to the
    /// first pending heredoc when a newline is crossed, and leaves `position`
    /// at the first significant byte. Always returns `Some(())` in the current
    /// implementation; the `Option` return lets callers propagate with `?`.
    #[inline]
    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
        // Don't reset after_newline if we're at the start of a line
        if self.position > 0 && self.position != self.line_start_offset {
            self.after_newline = false;
        }

        while self.position < self.input_bytes.len() {
            let byte = Self::byte_at(self.input_bytes, self.position);
            match byte {
                // Fast path for ASCII whitespace - batch process
                b' ' => {
                    // Batch skip spaces for better cache efficiency
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b' '
                    {
                        self.position += 1;
                    }
                    // Continue outer loop if we processed any spaces
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\t' | 0x0B | 0x0C => {
                    // Batch skip horizontal tab, vertical tab, and form feed.
                    // Perl treats these as whitespace separators.
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && matches!(
                            Self::byte_at(self.input_bytes, self.position),
                            b'\t' | 0x0B | 0x0C
                        )
                    {
                        self.position += 1;
                    }
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\r' | b'\n' => {
                    self.consume_newline();

                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                    // Only check if we have pending heredocs to avoid unnecessary work
                    if !self.pending_heredocs.is_empty() {
                        for spec in &mut self.pending_heredocs {
                            if spec.body_start == 0 {
                                spec.body_start = self.position;
                                break; // Only set for the first unresolved heredoc
                            }
                        }
                    }
                }
                b'#' => {
                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
                        break;
                    }

                    // Skip line comment using memchr for fast newline search
                    self.position += 1; // Skip # directly

                    // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
                    if let Some(newline_offset) =
                        memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
                    {
                        self.position += newline_offset;
                    } else {
                        // No newline found, skip to end
                        self.position = self.input_bytes.len();
                    }
                }
                // POD directives are only recognized at the start of a line.
                b'=' if self.position == 0
                    || (self.position > 0
                        && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
                {
                    // Check if this starts a POD section (=pod, =head, =over, etc.)
                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                    let remaining = &self.input_bytes[self.position..];
                    if remaining.starts_with(b"=pod")
                        || remaining.starts_with(b"=head")
                        || remaining.starts_with(b"=over")
                        || remaining.starts_with(b"=item")
                        || remaining.starts_with(b"=back")
                        || remaining.starts_with(b"=begin")
                        || remaining.starts_with(b"=end")
                        || remaining.starts_with(b"=for")
                        || remaining.starts_with(b"=encoding")
                    {
                        // Scan forward for \n=cut (end of POD block)
                        let search_start = self.position;
                        let mut found_cut = false;
                        let bytes = self.input_bytes;
                        let mut i = search_start;
                        while i < bytes.len() {
                            // Look for =cut at the start of a line
                            if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
                                && bytes[i..].starts_with(b"=cut")
                            {
                                i += 4; // Skip "=cut"
                                // Skip rest of the =cut line
                                while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
                                    i += 1;
                                }
                                // Consume one line ending sequence if present
                                if i < bytes.len() && bytes[i] == b'\r' {
                                    i += 1;
                                    if i < bytes.len() && bytes[i] == b'\n' {
                                        i += 1;
                                    }
                                } else if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                                self.position = i;
                                found_cut = true;
                                break;
                            }
                            i += 1;
                        }
                        if !found_cut {
                            // POD extends to end of file
                            self.position = bytes.len();
                        }
                        continue;
                    }
                    // Not a POD directive - regular '=' token
                    break;
                }
                _ => {
                    // For non-ASCII whitespace, use char check only when needed
                    if byte >= 128
                        && let Some(ch) = self.current_char()
                        && ch.is_whitespace()
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }
        }
        Some(())
    }
919
    /// Try to lex a heredoc introducer: `<<EOF`, `<<~EOF`, `<<"EOF"`,
    /// `<<'EOF'`, `` <<`EOF` ``, or `<<\EOF`.
    ///
    /// On success, queues a [`HeredocSpec`] on `pending_heredocs` (the body is
    /// consumed later, when `next_token` crosses the newline that follows this
    /// line) and returns a `HeredocStart` token. Returns `None` — with
    /// `self.position` restored — when `<<` should instead lex as the
    /// left-shift operator.
    fn try_heredoc(&mut self) -> Option<Token> {
        // `<<` is the left-shift operator, not a heredoc, when we are inside
        // a parenthesized expression and have just finished a term.
        // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
        // so `<<index` must be the bitshift operator, not a heredoc start.
        //
        // We must NOT fire the guard at statement level (paren_depth == 0) because
        // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
        // is a heredoc. The depth check distinguishes the two cases.
        if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
            return None;
        }

        // Check for heredoc start
        if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
            return None;
        }

        let start = self.position;
        let mut text = String::from("<<");
        self.position += 2; // Skip <<

        // Check for indented heredoc (~)
        let allow_indent = if self.current_char() == Some('~') {
            text.push('~');
            self.advance();
            true
        } else {
            false
        };

        // Skip whitespace between `<<` and the label (kept in `text` verbatim)
        while let Some(ch) = self.current_char() {
            if ch == ' ' || ch == '\t' {
                text.push(ch);
                self.advance();
            } else {
                break;
            }
        }

        // Optional backslash disables interpolation, treat like single-quoted label
        let backslashed = if self.current_char() == Some('\\') {
            text.push('\\');
            self.advance();
            true
        } else {
            false
        };

        // Parse delimiter (quoted or bare-word)
        let delimiter = if self.position < self.input.len() {
            match self.current_char() {
                Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
                Some('\'') if !backslashed => {
                    self.parse_quoted_heredoc_delimiter('\'', &mut text)?
                }
                Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
                Some(c) if is_perl_identifier_start(c) => {
                    // Bare word delimiter
                    let mut delim = String::new();
                    while self.position < self.input.len() {
                        if let Some(c) = self.current_char() {
                            if is_perl_identifier_continue(c) {
                                delim.push(c);
                                text.push(c);
                                self.advance();
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }
                    delim
                }
                _ => {
                    // Not a valid heredoc delimiter - reset position and return None
                    // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
                    self.position = start;
                    return None;
                }
            }
        } else {
            // No delimiter found - reset position and return None
            self.position = start;
            return None;
        };

        // For now, return a placeholder token
        // The actual heredoc body would be parsed later when we encounter it
        self.mode = LexerMode::ExpectOperator;

        // Recursion depth limit (Issue #443)
        if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
            return Some(Token {
                token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
                text: Arc::from(text),
                start,
                end: self.position,
            });
        }

        // Queue the heredoc spec with its label
        self.pending_heredocs.push(HeredocSpec {
            label: Arc::from(delimiter.as_str()),
            body_start: 0, // Will be set when we see the newline after this line
            allow_indent,
        });

        Some(Token {
            token_type: TokenType::HeredocStart,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1037
1038 fn try_string(&mut self) -> Option<Token> {
1039 let start = self.position;
1040 let quote = self.current_char()?;
1041
1042 match quote {
1043 '"' => self.parse_double_quoted_string(start),
1044 '\'' => self.parse_single_quoted_string(start),
1045 '`' => self.parse_backtick_string(start),
1046 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1047 _ => None,
1048 }
1049 }
1050
    /// Try to lex a numeric literal at the current position.
    ///
    /// Handles, in order:
    /// - `0x`/`0X` hexadecimal, `0b`/`0B` binary, and `0o`/`0O` octal prefixes
    ///   (each requires at least one real digit after the prefix; otherwise the
    ///   leading `0` falls through and is lexed as plain decimal),
    /// - decimal digits with `_` separators,
    /// - an optional fractional part — the `.` is consumed only when followed
    ///   by a digit, end-of-input, or an operator/terminator byte, so e.g.
    ///   `1.foo` keeps the `.` out of the number token,
    /// - an optional `e`/`E` exponent with optional sign; if no digit follows
    ///   the marker the lexer backtracks so the `e` lexes as a separate token.
    ///
    /// On success, advances `self.position` past the literal, switches the
    /// lexer to `ExpectOperator` mode, and returns a `Number` token spanning
    /// `start..self.position`. Returns `None` when the current byte is not an
    /// ASCII digit.
    #[inline]
    fn try_number(&mut self) -> Option<Token> {
        let start = self.position;

        // Fast byte check for digits - optimized bounds checking
        let bytes = self.input_bytes;
        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
            return None;
        }

        // Check for hex (0x), binary (0b), or octal (0o) prefixes
        let mut pos = self.position;
        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
            let prefix_byte = bytes[pos + 1];
            if prefix_byte == b'x' || prefix_byte == b'X' {
                // Hexadecimal: 0x[0-9a-fA-F_]+
                pos += 2; // consume '0x'
                let digit_start = pos;
                let mut saw_digit = false;
                // `saw_digit` distinguishes `0x1_2` (valid) from `0x__` (no digits).
                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
                    saw_digit |= bytes[pos].is_ascii_hexdigit();
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No hex digits after 0x - fall through to parse '0' as decimal
            } else if prefix_byte == b'b' || prefix_byte == b'B' {
                // Binary: 0b[01_]+
                pos += 2; // consume '0b'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
                {
                    saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No binary digits after 0b - fall through to parse '0' as decimal
            } else if prefix_byte == b'o' || prefix_byte == b'O' {
                // Octal (explicit): 0o[0-7_]+
                pos += 2; // consume '0o'
                let digit_start = pos;
                let mut saw_digit = false;
                while pos < bytes.len()
                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
                {
                    saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
                    pos += 1;
                }
                if pos > digit_start && saw_digit {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No octal digits after 0o - fall through to parse '0' as decimal
            }
        }

        // Consume initial digits - unrolled for better performance
        // (pos is re-synced to self.position here: a failed radix prefix above
        // leaves self.position at `start`, so the leading '0' is re-scanned.)
        pos = self.position;
        while pos < bytes.len() {
            let byte = Self::byte_at(bytes, pos);
            if byte.is_ascii_digit() || byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        self.position = pos;

        // Check for decimal point - optimized with single bounds check
        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
            // Peek ahead to see what follows the dot
            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();

            // Optimized dot consumption logic
            // Consume the dot when a digit follows, at end of input (`1.`),
            // or when an operator/terminator byte follows (`1. + 2`). This
            // deliberately includes 'e'/'E' so `1.e5` parses as one number.
            let should_consume_dot = has_following_digit || {
                pos + 1 >= bytes.len() || {
                    // Use bitwise operations for faster character classification
                    let next_byte = bytes[pos + 1];
                    // Whitespace, delimiters, operators - optimized check
                    // (next_byte <= b' ' covers all ASCII control chars + space)
                    next_byte <= b' '
                        || matches!(
                            next_byte,
                            b';' | b','
                                | b')'
                                | b'}'
                                | b']'
                                | b'+'
                                | b'-'
                                | b'*'
                                | b'/'
                                | b'%'
                                | b'='
                                | b'<'
                                | b'>'
                                | b'!'
                                | b'&'
                                | b'|'
                                | b'^'
                                | b'~'
                                | b'e'
                                | b'E'
                        )
                }
            };

            if should_consume_dot {
                pos += 1; // consume the dot
                // Consume fractional digits - batch processing
                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                self.position = pos;
            }
        }

        // Check for exponent - optimized
        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
            let exp_start = pos;
            pos += 1; // consume 'e' or 'E'

            // Check for optional sign
            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                pos += 1;
            }

            // Must have at least one digit after exponent (underscores allowed between digits)
            let mut saw_digit = false;
            while pos < bytes.len() {
                let byte = bytes[pos];
                if byte.is_ascii_digit() {
                    saw_digit = true;
                    pos += 1;
                } else if byte == b'_' {
                    pos += 1;
                } else {
                    break;
                }
            }

            // If no digits after exponent, backtrack
            if !saw_digit {
                pos = exp_start;
            }

            self.position = pos;
        }

        // Avoid string slicing for common number cases - use Arc::from directly on slice
        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Number(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1237
1238 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1239 // We're at the dot, consume it
1240 self.advance();
1241
1242 // Parse the fractional part
1243 while self.position < self.input_bytes.len() {
1244 let byte = self.input_bytes[self.position];
1245 match byte {
1246 b'0'..=b'9' | b'_' => self.position += 1,
1247 b'e' | b'E' => {
1248 // Handle scientific notation.
1249 // Save the position of 'e'/'E' so we can backtrack here if
1250 // no digits follow the exponent marker (with or without sign).
1251 let e_pos = self.position;
1252 self.advance();
1253 if self.position < self.input_bytes.len() {
1254 let next = self.input_bytes[self.position];
1255 if next == b'+' || next == b'-' {
1256 self.advance();
1257 }
1258 }
1259 // Parse exponent digits (underscores allowed between digits)
1260 let exponent_start = self.position;
1261 let mut saw_digit = false;
1262 while self.position < self.input_bytes.len() {
1263 let byte = self.input_bytes[self.position];
1264 if byte.is_ascii_digit() {
1265 saw_digit = true;
1266 self.position += 1;
1267 } else if byte == b'_' {
1268 self.position += 1;
1269 } else {
1270 break;
1271 }
1272 }
1273
1274 // No digits after exponent marker — backtrack to just before
1275 // 'e'/'E' so the caller sees it as a separate token.
1276 // Using e_pos (not exponent_start-1) avoids including 'e' in
1277 // the number slice when a sign character was consumed.
1278 if !saw_digit {
1279 let _ = exponent_start; // mark as intentionally unused
1280 self.position = e_pos;
1281 }
1282 break;
1283 }
1284 _ => break,
1285 }
1286 }
1287
1288 let text = &self.input[start..self.position];
1289 self.mode = LexerMode::ExpectOperator;
1290
1291 Some(Token {
1292 token_type: TokenType::Number(Arc::from(text)),
1293 text: Arc::from(text),
1294 start,
1295 end: self.position,
1296 })
1297 }
1298
    /// Try to lex a variable token beginning at a sigil (`$`, `@`, `%`, `*`).
    ///
    /// Covers plain variables (`$foo`, `@bar`, `%baz`), package-qualified
    /// names (`$Foo::bar`), the array-length form `$#array`, braced names
    /// (`${foo}`, `${^MATCH}`, stash access `$::{foo}`), globs (`*name`,
    /// `*{...}`), punctuation variables (`$!`, `$$`, `$^W`, `@+`, …), and the
    /// sigil-only tokens emitted for dereference syntax (`@{$ref}`) and
    /// postfix dereference (`->@*`).
    ///
    /// Side effects on success: advances `self.position` past the consumed
    /// text, sets `self.mode = ExpectOperator`, and updates
    /// `self.after_var_subscript` so a following `{` can be classified as a
    /// hash subscript rather than a block opener. Returns `None` when the
    /// sigil should instead be treated as an operator (`%`/`*` while in
    /// ExpectOperator mode) or the character is not a sigil at all.
    fn try_variable(&mut self) -> Option<Token> {
        let start = self.position;
        let sigil = self.current_char()?;

        match sigil {
            '$' | '@' | '%' | '*' => {
                // In ExpectOperator mode, treat % and * as operators rather than sigils
                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                    return None;
                }
                self.advance();

                // Special case: After ->, sigils followed by { or [ should be tokenized separately
                // This is for postfix dereference like ->@*, ->%{}, ->@[]
                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
                let check_arrow = self.position >= 3
                    && self.position.saturating_sub(1) <= self.input.len()
                    && self.input.is_char_boundary(self.position.saturating_sub(3))
                    && self.input.is_char_boundary(self.position.saturating_sub(1));

                if check_arrow
                    && {
                        // Temporarily rewind 3 bytes (2 for "->", 1 for the
                        // sigil we just consumed) to test for a preceding arrow.
                        let saved = self.position;
                        self.position -= 3;
                        let arrow = self.matches_bytes(b"->");
                        self.position = saved;
                        arrow
                    }
                    && matches!(self.current_char(), Some('{' | '[' | '*'))
                {
                    // Just return the sigil
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for $# (array length operator)
                if sigil == '$' && self.current_char() == Some('#') {
                    self.advance(); // consume #
                    // Now parse the array name
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else if ch == ':' && self.peek_char(1) == Some(':') {
                            // Package-qualified array name
                            self.advance();
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // $#foo is a complete variable token; a following `{` is a subscript.
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
                if self.current_char() == Some('{') {
                    // Peek ahead to decide if we should consume the brace
                    let next_char = self.peek_char(1);

                    // Check if this is a dereference like @{$ref} or @{[...]}
                    // If the next char suggests dereference, don't consume the brace.
                    // For @ and % sigils, identifiers inside braces are also derefs
                    // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                    let is_deref = sigil != '*'
                        && (matches!(
                            next_char,
                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                        ) || (matches!(sigil, '@' | '%')
                            && next_char.is_some_and(is_perl_identifier_start)));
                    if is_deref {
                        // This is a dereference, don't consume the brace
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // A standalone sigil token before `{` starts a dereference
                        // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}` / `&{$cref}`).
                        // Mark it as subscript-capable so `{` increments brace depth
                        // and the closing `}` can enable chained `{...}` subscripts.
                        // (Broader form than master's `$|@|%` filter — `*` is already
                        // excluded by the `is_deref` guard above and `&` deref also
                        // benefits from chained-subscript handling.)
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    self.advance(); // consume {

                    // Handle special variables with caret
                    if self.current_char() == Some('^') {
                        self.advance(); // consume ^
                        // Parse the special variable name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Handle stash access like $::{foo}
                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance(); // consume first :
                        self.advance(); // consume second :
                        // Skip optional { and }
                        if self.current_char() == Some('{') {
                            self.advance();
                        }
                        // Parse the name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                if self.current_char() == Some('}') {
                                    self.advance(); // consume closing } of ${...}
                                }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Regular braced variable like ${foo} or glob like *{$glob}
                    else {
                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                        // EXCEPT for globs - *{$glob} should be parsed as one token
                        // Also check for empty braces or EOF - in these cases we should split the tokens
                        if sigil != '*'
                            && !self.current_char().is_some_and(is_perl_identifier_start)
                        {
                            // This is a dereference or empty/invalid brace, backtrack
                            // NOTE: `start + 1` is safe here because all sigils are
                            // single-byte ASCII characters.
                            self.position = start + 1; // Just past the sigil
                            let text = &self.input[start..self.position];
                            self.mode = LexerMode::ExpectOperator;
                            // Same as above: sigil-only token means a dereference opener.
                            self.after_var_subscript = true;

                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                text: Arc::from(text),
                                start,
                                end: self.position,
                            });
                        }

                        // For glob access, we need to consume everything inside braces
                        if sigil == '*' {
                            let mut brace_depth: usize = 1;
                            while let Some(ch) = self.current_char() {
                                if ch == '{' {
                                    brace_depth += 1;
                                } else if ch == '}' {
                                    brace_depth = brace_depth.saturating_sub(1);
                                    if brace_depth == 0 {
                                        self.advance(); // consume final }
                                        break;
                                    }
                                }
                                self.advance();
                            }
                        } else {
                            // Regular variable
                            while let Some(ch) = self.current_char() {
                                if ch == '}' {
                                    self.advance(); // consume }
                                    break;
                                } else if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                }
                // Parse regular variable name
                else if let Some(ch) = self.current_char() {
                    if is_perl_identifier_start(ch) {
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                        // Handle package-qualified segments like Foo::bar
                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                            self.advance();
                            self.advance();
                            while let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                    // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                    // Not inside prototypes where ^ is a literal prototype char
                    else if sigil == '$' && ch == '^' && !self.in_prototype {
                        self.advance(); // consume ^
                        // $^Letter: consume the single uppercase letter
                        if let Some(letter) = self.current_char()
                            && letter.is_ascii_uppercase()
                        {
                            self.advance();
                        }
                        // bare $^ (no uppercase letter follows): format_top_name — stop here
                    }
                    // Handle special punctuation variables
                    // Not inside prototypes where ; and , are literal prototype chars
                    else if sigil == '$'
                        && !self.in_prototype
                        && matches!(
                            ch,
                            '?' | '!'
                                | '@'
                                | '&'
                                | '`'
                                | '\''
                                | '.'
                                | '/'
                                | '\\'
                                | '|'
                                | '+'
                                | '-'
                                | '['
                                | ']'
                                | '$'
                                | '~'
                                | '='
                                | '%'
                                | ','
                                | '"'
                                | ';'
                                | '>'
                                | '<'
                                | ')'
                                | '(' // $( = real group ID of this process
                        )
                    {
                        self.advance(); // consume the special character
                    }
                    // $$ is the PID special variable, but only when it is not immediately
                    // followed by an identifier-start character. $$var is scalar dereference
                    // of $var, so keep the second $ for the next token.
                    else if sigil == '$' && ch == '$' {
                        if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                            self.advance(); // consume the second $ for bare $$ PID
                        }
                    }
                    // Handle special array/hash punctuation variables
                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                        self.advance(); // consume the + or -
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // A complete $foo, @foo, %foo token can be followed by a hash/slice
                // subscript `{`. Set the flag so the `{` handler knows to increment
                // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
                // hash subscripts in the same way.
                self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

                Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                })
            }
            _ => None,
        }
    }
1601
1602 /// Return the next non-space char and the char immediately following it (without consuming).
1603 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1604 /// from `=` used as a plain delimiter.
1605 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1606 let mut i = self.position;
1607 while i < self.input.len() {
1608 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1609 Some(c) => c,
1610 None => return (None, None),
1611 };
1612 if c.is_whitespace() {
1613 i += c.len_utf8();
1614 continue;
1615 }
1616 // Found non-space at position i; peek the next char after it
1617 let j = i + c.len_utf8();
1618 let following = self.input.get(j..).and_then(|s| s.chars().next());
1619 return (Some(c), following);
1620 }
1621 (None, None)
1622 }
1623
1624 /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1625 fn is_quote_delim(c: char) -> bool {
1626 // Perl allows any non-alphanumeric, non-whitespace character as delimiter,
1627 // including control characters (e.g. s\x07pattern\x07replacement\x07).
1628 !c.is_ascii_alphanumeric() && !c.is_whitespace()
1629 }
1630
1631 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1632 ///
1633 /// A v-string starts with `v` followed by one or more digits, then optionally
1634 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1635 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1636 #[inline]
1637 fn try_vstring(&mut self) -> Option<Token> {
1638 let start = self.position;
1639 let bytes = self.input_bytes;
1640
1641 // Must start with 'v' followed by at least one digit
1642 if start >= bytes.len() || bytes[start] != b'v' {
1643 return None;
1644 }
1645
1646 let next_pos = start + 1;
1647 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1648 return None;
1649 }
1650
1651 // We have `v` followed by a digit — scan the rest of the v-string.
1652 // Pattern: v DIGITS (.DIGITS)*
1653 let mut pos = next_pos;
1654
1655 // Consume leading digits
1656 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1657 pos += 1;
1658 }
1659
1660 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1661 while pos < bytes.len() && bytes[pos] == b'.' {
1662 let dot_pos = pos;
1663 pos += 1; // skip '.'
1664
1665 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1666 // Dot not followed by digit — not part of the v-string
1667 pos = dot_pos;
1668 break;
1669 }
1670
1671 // Consume digits after the dot
1672 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1673 pos += 1;
1674 }
1675 }
1676
1677 // Make sure the v-string isn't followed by identifier-continuation characters
1678 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1679 if pos < bytes.len() {
1680 let next_byte = bytes[pos];
1681 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1682 return None;
1683 }
1684 // Also check for non-ASCII identifier continuations
1685 if next_byte >= 128
1686 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1687 && is_perl_identifier_continue(ch)
1688 {
1689 return None;
1690 }
1691 }
1692
1693 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1694 let text = &self.input[start..pos];
1695
1696 self.position = pos;
1697 self.mode = LexerMode::ExpectOperator;
1698
1699 Some(Token {
1700 token_type: TokenType::Version(Arc::from(text)),
1701 text: Arc::from(text),
1702 start,
1703 end: self.position,
1704 })
1705 }
1706
1707 #[inline]
1708 fn apostrophe_starts_legacy_package_segment(&self, position: usize) -> bool {
1709 let next_position = position + '\''.len_utf8();
1710 self.input
1711 .get(next_position..)
1712 .and_then(|suffix| suffix.chars().next())
1713 .is_some_and(is_perl_identifier_start)
1714 }
1715
1716 #[inline]
1717 fn try_identifier_or_keyword(&mut self) -> Option<Token> {
1718 let start = self.position;
1719 let ch = self.current_char()?;
1720 let bytes = self.input_bytes;
1721 let len = bytes.len();
1722
1723 if is_perl_identifier_start(ch) {
1724 // Special case: substitution/transliteration with single-quote delimiter
1725 // The single quote is considered an identifier continuation, so we need to
1726 // detect these operators before consuming it as part of an identifier.
1727 if !self.after_arrow
1728 && self.hash_brace_depth == 0
1729 && ch == 's'
1730 && self.peek_char(1) == Some('\'')
1731 {
1732 self.advance(); // consume 's'
1733 return self.parse_substitution(start);
1734 } else if !self.after_arrow
1735 && self.hash_brace_depth == 0
1736 && ch == 'y'
1737 && self.peek_char(1) == Some('\'')
1738 {
1739 self.advance(); // consume 'y'
1740 return self.parse_transliteration(start);
1741 } else if !self.after_arrow
1742 && self.hash_brace_depth == 0
1743 && ch == 't'
1744 && self.peek_char(1) == Some('r')
1745 && self.peek_char(2) == Some('\'')
1746 {
1747 self.advance(); // consume 't'
1748 self.advance(); // consume 'r'
1749 return self.parse_transliteration(start);
1750 }
1751
1752 // Fast ASCII path for identifier continuation.
1753 while self.position < len {
1754 let byte = bytes[self.position];
1755 if byte == b'\'' {
1756 if is_quote_op_word_prefix(&bytes[start..self.position])
1757 || !self.apostrophe_starts_legacy_package_segment(self.position)
1758 {
1759 // Keep apostrophe for quote/string parsing in cases like q'...'
1760 // and split' ', while still accepting Foo'Bar package spelling.
1761 break;
1762 }
1763 self.position += 1;
1764 continue;
1765 }
1766
1767 if byte.is_ascii_alphanumeric() || byte == b'_' {
1768 self.position += 1;
1769 continue;
1770 }
1771
1772 if byte < 128 {
1773 break;
1774 }
1775
1776 if let Some(ch) = self.current_char()
1777 && is_perl_identifier_continue(ch)
1778 {
1779 self.advance();
1780 continue;
1781 }
1782 break;
1783 }
1784 // Handle package-qualified identifiers like Foo::bar.
1785 while self.config.max_lookahead >= 1
1786 && self.position + 1 < len
1787 && bytes[self.position] == b':'
1788 && bytes[self.position + 1] == b':'
1789 {
1790 self.position += 2; // consume '::'
1791
1792 // consume following identifier segment if present
1793 let Some(ch) = self.current_char() else {
1794 break;
1795 };
1796 if !is_perl_identifier_start(ch) {
1797 break;
1798 }
1799 self.advance();
1800 while self.position < len {
1801 let byte = bytes[self.position];
1802 if byte == b'\'' {
1803 if !self.apostrophe_starts_legacy_package_segment(self.position) {
1804 break;
1805 }
1806 self.position += 1;
1807 continue;
1808 }
1809
1810 if byte.is_ascii_alphanumeric() || byte == b'_' {
1811 self.position += 1;
1812 continue;
1813 }
1814 if byte < 128 {
1815 break;
1816 }
1817 if let Some(ch) = self.current_char()
1818 && is_perl_identifier_continue(ch)
1819 {
1820 self.advance();
1821 continue;
1822 }
1823 break;
1824 }
1825 }
1826
1827 let text = &self.input[start..self.position];
1828
1829 // Check for __DATA__ and __END__ markers using exact match
1830 // Only recognize these in code channel, not inside data/format sections or heredocs
1831 let in_code_channel =
1832 !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
1833 && self.pending_heredocs.is_empty();
1834
1835 let marker = if in_code_channel {
1836 if text == "__DATA__" {
1837 Some("__DATA__")
1838 } else if text == "__END__" {
1839 Some("__END__")
1840 } else {
1841 None
1842 }
1843 } else {
1844 None
1845 };
1846
1847 if let Some(marker_text) = marker {
1848 // These must be at the beginning of a line
1849 // Use the after_newline flag to determine if we're at line start
1850 if self.after_newline {
1851 // Check if rest of line is only whitespace
1852 // Only treat as data marker if line has no trailing junk
1853 if Self::trailing_ws_only(self.input_bytes, self.position) {
1854 // Consume the rest of the line (the marker line)
1855 while self.position < self.input.len()
1856 && self.input_bytes[self.position] != b'\n'
1857 && self.input_bytes[self.position] != b'\r'
1858 {
1859 self.advance();
1860 }
1861 self.consume_newline();
1862
1863 // Switch to data section mode
1864 self.mode = LexerMode::InDataSection;
1865
1866 return Some(Token {
1867 token_type: TokenType::DataMarker(Arc::from(marker_text)),
1868 text: Arc::from(marker_text),
1869 start,
1870 end: self.position,
1871 });
1872 }
1873 }
1874 }
1875
1876 // Check for substitution/transliteration operators
1877 // Skip if after '->' -- these are method names, not operators.
1878 #[allow(clippy::collapsible_if)]
1879 if !self.after_arrow && self.hash_brace_depth == 0 && matches!(text, "s" | "tr" | "y") {
1880 let immediate = self.current_char();
1881 let (candidate, char_after_next, has_whitespace) =
1882 if immediate.is_some_and(|c| c.is_whitespace()) {
1883 let (nc, ca) = self.peek_nonspace_and_following();
1884 (nc, ca, true)
1885 } else {
1886 let following = immediate.and_then(|c| {
1887 let j = self.position + c.len_utf8();
1888 self.input.get(j..).and_then(|s| s.chars().next())
1889 });
1890 (immediate, following, false)
1891 };
1892
1893 if let Some(next) = candidate {
1894 // `s => 1` should remain a fat-arrow hash key, not quote op.
1895 let is_fat_arrow = next == '=' && char_after_next == Some('>');
1896 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
1897 let is_quote_char = matches!(next, '\'' | '"') && text != "s";
1898 let transliteration_allows_whitespace = text == "tr" || text == "y";
1899 let substitution_disallows_whitespace = text == "s" && has_whitespace;
1900 let is_valid_delim = Self::is_quote_delim(next)
1901 && !is_fat_arrow
1902 && !substitution_disallows_whitespace
1903 && (!has_whitespace
1904 || is_paired_delim
1905 || is_quote_char
1906 || transliteration_allows_whitespace);
1907
1908 if is_valid_delim {
1909 match text {
1910 "s" => return self.parse_substitution(start),
1911 "tr" | "y" => return self.parse_transliteration(start),
1912 unexpected => {
1913 return Some(Token {
1914 token_type: TokenType::Error(Arc::from(format!(
1915 "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
1916 unexpected, start
1917 ))),
1918 text: Arc::from(unexpected),
1919 start,
1920 end: self.position,
1921 });
1922 }
1923 }
1924 }
1925 }
1926 }
1927
1928 let token_type = if is_keyword_fast(text) {
1929 // Check for special keywords that affect lexer mode
1930 match text {
1931 "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
1932 | "sort" | "split" | "and" | "or" | "xor" | "not"
1933 // These keywords introduce an expression, so a following `/` is a
1934 // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
1935 // `do /file/`, and `eval /re/` are all valid Perl.
1936 | "return" | "die" | "warn" | "do" | "eval" => {
1937 self.mode = LexerMode::ExpectTerm;
1938 }
1939 "sub" => {
1940 self.after_sub = true;
1941 self.mode = LexerMode::ExpectTerm;
1942 }
1943 // Quote operators expect a delimiter next.
1944 // Skip if after '->' -- these are method names, not operators.
1945 // Skip inside hash subscript braces (hash_brace_depth > 0) — all
1946 // positions inside `$h{...}` or `@h{...}` treat quote-op names as
1947 // bareword keys, including after commas in slices like `@h{m, s}`.
1948 op if !self.after_arrow
1949 && self.hash_brace_depth == 0
1950 && quote_handler::is_quote_operator(op) =>
1951 {
1952 // Perl allows whitespace between a quote-like operator and its delimiter,
1953 // but ONLY for paired delimiters (s { ... } { ... }g).
1954 // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
1955 // must be immediately adjacent — otherwise `s $foo` would wrongly
1956 // treat `$` as a delimiter instead of being a bareword `s` followed
1957 // by a scalar variable.
1958 //
1959 // Strategy:
1960 // 1. Check the immediately-adjacent char first (no whitespace skip).
1961 // If it is a valid delimiter → any non-alnum, non-whitespace char.
1962 // 2. If the adjacent char is whitespace, peek past it.
1963 // Only accept PAIRED delimiters ({, [, (, <) in that case.
1964 let immediate = self.current_char();
1965 let (candidate, char_after_next, has_whitespace) =
1966 if immediate.is_some_and(|c| c.is_whitespace()) {
1967 // There is whitespace — peek past it
1968 let (nc, ca) = self.peek_nonspace_and_following();
1969 (nc, ca, true)
1970 } else {
1971 // No whitespace — use immediate char
1972 let following = immediate.and_then(|c| {
1973 let j = self.position + c.len_utf8();
1974 self.input.get(j..).and_then(|s| s.chars().next())
1975 });
1976 (immediate, following, false)
1977 };
1978
1979 if let Some(next) = candidate {
1980 // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
1981 // not a valid substitution delimiter. Treat as identifier.
1982 let is_fat_arrow = next == '=' && char_after_next == Some('>');
1983
1984 // When whitespace precedes the delimiter, only unambiguous
1985 // delimiters are accepted:
1986 // - Paired delimiters ({, [, (, <) are always safe.
1987 // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
1988 // is a valid file-size filetest and must not be treated as a
1989 // substitution start. All other operators (qw, q, qq, qr, qx, m,
1990 // tr, y) have no corresponding file-test operator.
1991 // - / is safe for non-substitution quote operators; `qw /a b/` and
1992 // `m /re/` are common, while `s /foo/bar/` remains ambiguous with
1993 // the file-size test shape and stays rejected here.
1994 // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
1995 let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
1996 let is_quote_char = matches!(next, '\'' | '"') && op != "s";
1997 let is_spaced_slash_delim = next == '/' && op != "s";
1998 let is_valid_delim = Self::is_quote_delim(next)
1999 && !is_fat_arrow
2000 && (!has_whitespace
2001 || is_paired_delim
2002 || is_quote_char
2003 || is_spaced_slash_delim);
2004
2005 if is_valid_delim {
2006 self.mode = LexerMode::ExpectDelimiter;
2007 self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
2008 operator: op.to_string(),
2009 delimiter: '\0', // Will be set when we see the delimiter
2010 start_pos: start,
2011 });
2012
2013 // Don't return a keyword token - continue to parse the delimiter
2014 // Skip any whitespace between operator and delimiter
2015 while let Some(ch) = self.current_char() {
2016 if ch.is_whitespace() {
2017 self.advance();
2018 } else {
2019 break;
2020 }
2021 }
2022
2023 // Get the delimiter
2024 #[allow(clippy::collapsible_if)]
2025 if let Some(delim) = self.current_char() {
2026 if !delim.is_alphanumeric() {
2027 self.advance();
2028 if let Some(ref mut info) = self.current_quote_op {
2029 info.delimiter = delim;
2030 }
2031 // Parse the quote operator content and return the complete token
2032 return self.parse_quote_operator(delim);
2033 }
2034 }
2035 } else {
2036 // Not a quote operator here → treat as IDENTIFIER
2037 self.current_quote_op = None;
2038 self.mode = LexerMode::ExpectOperator;
2039 return Some(Token {
2040 token_type: TokenType::Identifier(Arc::from(text)),
2041 start,
2042 end: self.position,
2043 text: Arc::from(text),
2044 });
2045 }
2046 } else {
2047 // End-of-input after the word → also treat as IDENTIFIER
2048 self.current_quote_op = None;
2049 self.mode = LexerMode::ExpectOperator;
2050 return Some(Token {
2051 token_type: TokenType::Identifier(Arc::from(text)),
2052 start,
2053 end: self.position,
2054 text: Arc::from(text),
2055 });
2056 }
2057 // If we get here but haven't returned, something went wrong
2058 // Fall through to treat as identifier
2059 self.current_quote_op = None;
2060 self.mode = LexerMode::ExpectOperator;
2061 return Some(Token {
2062 token_type: TokenType::Identifier(Arc::from(text)),
2063 start,
2064 end: self.position,
2065 text: Arc::from(text),
2066 });
2067 }
2068 // Format declarations need special handling
2069 "format" => {
2070 // We'll need to check for the = after the format name
2071 // For now, just mark that we saw format
2072 }
2073 _ if is_builtin_function(text) => {
2074 // Bare builtins are term-introducing in Perl.
2075 self.mode = LexerMode::ExpectTerm;
2076 }
2077 _ => {
2078 self.mode = LexerMode::ExpectOperator;
2079 }
2080 }
2081 TokenType::Keyword(Arc::from(text))
2082 } else {
2083 // Mirror parser bare-builtin handling so `/` after builtins like
2084 // `join` or `print` is lexed as a regex term, not division.
2085 if is_builtin_function(text) {
2086 self.mode = LexerMode::ExpectTerm;
2087 } else {
2088 self.mode = LexerMode::ExpectOperator;
2089 }
2090 TokenType::Identifier(Arc::from(text))
2091 };
2092
2093 self.after_arrow = false;
2094 // A keyword/identifier is not a variable; `{` after it is a block opener.
2095 self.after_var_subscript = false;
2096 // hash_brace_depth is managed by { and } handlers, not cleared per-token
2097 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2098 } else {
2099 None
2100 }
2101 }
2102
2103 /// Parse data section body - consumes everything to EOF
2104 fn parse_data_body(&mut self) -> Option<Token> {
2105 if self.position >= self.input.len() {
2106 // Already at EOF
2107 self.mode = LexerMode::ExpectTerm;
2108 return Some(Token {
2109 token_type: TokenType::EOF,
2110 text: Arc::from(""),
2111 start: self.position,
2112 end: self.position,
2113 });
2114 }
2115
2116 let start = self.position;
2117 // Consume everything to EOF
2118 let body = &self.input[self.position..];
2119 self.position = self.input.len();
2120
2121 // Reset mode for next parse (though we're at EOF)
2122 self.mode = LexerMode::ExpectTerm;
2123
2124 Some(Token {
2125 token_type: TokenType::DataBody(Arc::from(body)),
2126 text: Arc::from(body),
2127 start,
2128 end: self.position,
2129 })
2130 }
2131
2132 /// Parse format body - consumes until a line with just a dot
2133 fn parse_format_body(&mut self) -> Option<Token> {
2134 let start = self.position;
2135 let mut body = String::new();
2136 let mut line_start = true;
2137
2138 while self.position < self.input.len() {
2139 // Check if we're at the start of a line and the next char is a dot
2140 if line_start && self.current_char() == Some('.') {
2141 // Check if this line contains only a dot
2142 let mut peek_pos = self.position + 1;
2143 let mut found_terminator = true;
2144
2145 // Skip any trailing whitespace on the dot line
2146 while peek_pos < self.input.len() {
2147 match self.input_bytes[peek_pos] {
2148 b' ' | b'\t' | b'\r' => peek_pos += 1,
2149 b'\n' => break,
2150 _ => {
2151 found_terminator = false;
2152 break;
2153 }
2154 }
2155 }
2156
2157 if found_terminator {
2158 // We found the terminating dot, consume it
2159 self.position = peek_pos;
2160 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2161 {
2162 self.position += 1;
2163 }
2164
2165 // Switch back to normal mode
2166 self.mode = LexerMode::ExpectTerm;
2167
2168 return Some(Token {
2169 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2170 text: Arc::from(body),
2171 start,
2172 end: self.position,
2173 });
2174 }
2175 }
2176
2177 // Not a terminator, consume the character
2178 match self.current_char() {
2179 Some(ch) => {
2180 body.push(ch);
2181 self.advance();
2182
2183 // Track if we're at the start of a line
2184 line_start = ch == '\n';
2185 }
2186 None => {
2187 // Reached EOF without finding terminator
2188 break;
2189 }
2190 }
2191 }
2192
2193 // If we reach here, we didn't find a terminator
2194 self.mode = LexerMode::ExpectTerm;
2195 Some(Token {
2196 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2197 text: Arc::from(body),
2198 start,
2199 end: self.position,
2200 })
2201 }
2202
2203 fn try_operator(&mut self) -> Option<Token> {
2204 // Skip operator parsing if we're expecting a delimiter for a quote operator
2205 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2206 return None;
2207 }
2208
2209 let start = self.position;
2210 let ch = self.current_char()?;
2211
2212 // ═══════════════════════════════════════════════════════════════════════
2213 // SLASH DISAMBIGUATION STRATEGY (Issue #422)
2214 // ═══════════════════════════════════════════════════════════════════════
2215 //
2216 // Perl's `/` character is ambiguous:
2217 // - Division operator: `$x / 2`
2218 // - Regex delimiter: `/pattern/`
2219 // - Defined-or operator: `$x // $y`
2220 //
2221 // **Disambiguation Strategy (Context-Aware Heuristics):**
2222 //
2223 // 1. **Mode-Based Decision (Primary)**:
2224 // - `LexerMode::ExpectTerm` → `/` starts a regex
2225 // Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
2226 // - `LexerMode::ExpectOperator` → `/` is division or `//`
2227 // Examples: `$x / 2`, `$x // $y`, `) / 3`
2228 //
2229 // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
2230 // Mode is set based on previous token:
2231 // - After identifier/number/closing paren → ExpectOperator → division
2232 // - After operator/keyword/opening paren → ExpectTerm → regex
2233 //
2234 // 3. **Budget Protection**:
2235 // - Regex parsing has a parse-step budget and byte budget
2236 // - Budget exceeded → emit UnknownRest token (graceful degradation)
2237 // - See `parse_regex()` and `budget_guard()` for implementation
2238 //
2239 // 4. **Performance Characteristics**:
2240 // - Single-pass: O(1) decision based on mode flag
2241 // - No backtracking: Mode updated after each token
2242 // - Optimized: Byte-level operations for common cases
2243 //
2244 // **Metrics & Monitoring**:
2245 // - Budget exceeded events tracked via UnknownRest token emission
2246 // - LSP diagnostics generated for truncated regexes
2247 // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
2248 //
2249 // ═══════════════════════════════════════════════════════════════════════
2250
2251 if ch == '/' {
2252 if self.mode == LexerMode::ExpectTerm {
2253 // Mode indicates we're expecting a term → `/` starts a regex
2254 // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
2255 return self.parse_regex(start);
2256 } else {
2257 // Mode indicates we're expecting an operator → `/` is division or `//`
2258 // Examples: `$x / 2`, `$x // $y`, `10 / 3`
2259 self.advance();
2260 // Check for // or //= using byte-level operations for speed
2261 if self.peek_byte(0) == Some(b'/') {
2262 self.position += 1; // consume second / directly
2263 if self.peek_byte(0) == Some(b'=') {
2264 self.position += 1; // consume = directly
2265 let text = &self.input[start..self.position];
2266 self.mode = LexerMode::ExpectTerm;
2267 return Some(Token {
2268 token_type: TokenType::Operator(Arc::from(text)),
2269 text: Arc::from(text),
2270 start,
2271 end: self.position,
2272 });
2273 } else {
2274 // Use cached string for common "//" operator
2275 self.mode = LexerMode::ExpectTerm;
2276 return Some(Token {
2277 token_type: TokenType::Operator(Arc::from("//")),
2278 text: Arc::from("//"),
2279 start,
2280 end: self.position,
2281 });
2282 }
2283 } else if self.position < self.input_bytes.len()
2284 && self.input_bytes[self.position] == b'='
2285 {
2286 // /= division-assign operator
2287 self.position += 1; // consume =
2288 self.mode = LexerMode::ExpectTerm;
2289 return Some(Token {
2290 token_type: TokenType::Operator(Arc::from("/=")),
2291 text: Arc::from("/="),
2292 start,
2293 end: self.position,
2294 });
2295 } else {
2296 // Use cached string for common "/" division
2297 self.mode = LexerMode::ExpectTerm;
2298 return Some(Token {
2299 token_type: TokenType::Division,
2300 text: Arc::from("/"),
2301 start,
2302 end: self.position,
2303 });
2304 }
2305 }
2306 }
2307
2308 // Handle other operators - simplified
2309 match ch {
2310 '.' => {
2311 // Check if it's a decimal number like .5 -- but only when we
2312 // expect a term. In operator position `.5` is concatenation
2313 // of the bareword/number on the left with the number `5`.
2314 if self.mode != LexerMode::ExpectOperator
2315 && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
2316 {
2317 return self.parse_decimal_number(start);
2318 }
2319 self.advance();
2320 // Check for compound operators
2321 #[allow(clippy::collapsible_if)]
2322 if let Some(next) = self.current_char() {
2323 if is_compound_operator(ch, next) {
2324 self.advance();
2325
2326 // Check for three-character operators like **=, <<=, >>=
2327 if self.position < self.input.len() {
2328 let third = self.current_char();
2329 // Check for three-character operators
2330 if matches!(
2331 (ch, next, third),
2332 ('*', '*', Some('='))
2333 | ('<', '<', Some('='))
2334 | ('>', '>', Some('='))
2335 | ('&', '&', Some('='))
2336 | ('|', '|', Some('='))
2337 | ('/', '/', Some('='))
2338 ) {
2339 self.advance(); // consume the =
2340 } else if ch == '<' && next == '=' && third == Some('>') {
2341 self.advance(); // consume the >
2342 // Special case: <=> spaceship operator
2343 } else if ch == '.' && next == '.' && third == Some('.') {
2344 self.advance(); // consume the third .
2345 }
2346 }
2347 }
2348 }
2349 }
2350 '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
2351 | '\\' => {
2352 self.advance();
2353 // Check for compound operators
2354 #[allow(clippy::collapsible_if)]
2355 if let Some(next) = self.current_char() {
2356 if is_compound_operator(ch, next) {
2357 self.advance();
2358
2359 // Check for three-character operators like **=, <<=, >>=
2360 if self.position < self.input.len() {
2361 let third = self.current_char();
2362 // Check for three-character operators
2363 if matches!(
2364 (ch, next, third),
2365 ('*', '*', Some('='))
2366 | ('<', '<', Some('='))
2367 | ('>', '>', Some('='))
2368 | ('&', '&', Some('='))
2369 | ('|', '|', Some('='))
2370 | ('/', '/', Some('='))
2371 ) {
2372 self.advance(); // consume the =
2373 } else if ch == '<' && next == '=' && third == Some('>') {
2374 self.advance(); // consume the >
2375 // Special case: <=> spaceship operator
2376 }
2377 }
2378 }
2379 }
2380 }
2381 _ => return None,
2382 }
2383
2384 let text = &self.input[start..self.position];
2385 // Operator ends prototype window (e.g. `:` for attributes)
2386 self.after_sub = false;
2387 // Track whether this operator is '->' for method name disambiguation
2388 self.after_arrow = text == "->";
2389 // Any operator token ends the "just saw a variable" window; `{` after
2390 // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
2391 self.after_var_subscript = false;
2392 // Postfix ++ and -- complete a term expression, so next token is an operator
2393 // (e.g., "$x++ / 2" → / is division, not regex)
2394 if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
2395 // Postfix: stay in ExpectOperator
2396 } else {
2397 self.mode = LexerMode::ExpectTerm;
2398 }
2399
2400 Some(Token {
2401 token_type: TokenType::Operator(Arc::from(text)),
2402 text: Arc::from(text),
2403 start,
2404 end: self.position,
2405 })
2406 }
2407
2408 fn try_delimiter(&mut self) -> Option<Token> {
2409 let start = self.position;
2410 let ch = self.current_char()?;
2411
2412 // If we're expecting a delimiter for a quote operator, handle it specially
2413 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2414 // Accept any non-alphanumeric character as a delimiter
2415 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2416 self.advance();
2417 if let Some(ref mut info) = self.current_quote_op {
2418 info.delimiter = ch;
2419 }
2420 // Now parse the quote operator content
2421 return self.parse_quote_operator(ch);
2422 }
2423 }
2424
2425 match ch {
2426 '(' => {
2427 // Check if this is a quote operator delimiter
2428 if matches!(self.mode, LexerMode::ExpectDelimiter)
2429 && self.current_quote_op.is_some()
2430 {
2431 self.advance();
2432 if let Some(ref mut info) = self.current_quote_op {
2433 info.delimiter = ch;
2434 }
2435 return self.parse_quote_operator(ch);
2436 }
2437
2438 self.advance();
2439 if self.after_sub {
2440 // Promote after_sub to in_prototype now that we see '('
2441 self.in_prototype = true;
2442 self.after_sub = false;
2443 self.prototype_depth = 1;
2444 } else if self.in_prototype {
2445 self.prototype_depth += 1;
2446 }
2447 self.paren_depth += 1;
2448 self.after_var_subscript = false;
2449 self.mode = LexerMode::ExpectTerm;
2450 Some(Token {
2451 token_type: TokenType::LeftParen,
2452 text: Arc::from("("),
2453 start,
2454 end: self.position,
2455 })
2456 }
2457 ')' => {
2458 self.advance();
2459 if self.in_prototype && self.prototype_depth > 0 {
2460 self.prototype_depth -= 1;
2461 if self.prototype_depth == 0 {
2462 self.in_prototype = false;
2463 }
2464 }
2465 self.after_arrow = false;
2466 self.paren_depth = self.paren_depth.saturating_sub(1);
2467 // A closing paren ends any var-subscript context: `if ($var)` should
2468 // NOT leave after_var_subscript set, otherwise the following `{` would
2469 // incorrectly increment hash_brace_depth and suppress regex operators
2470 // inside the block body (issue #2844).
2471 self.after_var_subscript = false;
2472 self.mode = LexerMode::ExpectOperator;
2473 Some(Token {
2474 token_type: TokenType::RightParen,
2475 text: Arc::from(")"),
2476 start,
2477 end: self.position,
2478 })
2479 }
2480 ';' => {
2481 self.advance();
2482 // Semicolon ends prototype window (forward declaration)
2483 self.after_sub = false;
2484 // Semicolon is a statement boundary — any pending method-call chain is over.
2485 self.after_arrow = false;
2486 self.after_var_subscript = false;
2487 self.mode = LexerMode::ExpectTerm;
2488 Some(Token {
2489 token_type: TokenType::Semicolon,
2490 text: Arc::from(";"),
2491 start,
2492 end: self.position,
2493 })
2494 }
2495 ',' => {
2496 self.advance();
2497 self.after_var_subscript = false;
2498 self.mode = LexerMode::ExpectTerm;
2499 Some(Token {
2500 token_type: TokenType::Comma,
2501 text: Arc::from(","),
2502 start,
2503 end: self.position,
2504 })
2505 }
2506 '[' => {
2507 self.advance();
2508 self.after_var_subscript = false;
2509 self.mode = LexerMode::ExpectTerm;
2510 Some(Token {
2511 token_type: TokenType::LeftBracket,
2512 text: Arc::from("["),
2513 start,
2514 end: self.position,
2515 })
2516 }
2517 ']' => {
2518 self.advance();
2519 // A closing `]` from an array subscript leaves us in a state where
2520 // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
2521 // Set after_var_subscript so the `{` handler recognises it as such.
2522 // This mirrors the `}` handler's behavior when closing a hash subscript.
2523 self.after_var_subscript = true;
2524 self.mode = LexerMode::ExpectOperator;
2525 Some(Token {
2526 token_type: TokenType::RightBracket,
2527 text: Arc::from("]"),
2528 start,
2529 end: self.position,
2530 })
2531 }
2532 '{' => {
2533 self.advance();
2534 // Opening brace ends prototype window — no prototype follows
2535 self.after_sub = false;
2536 // `{` is a hash/slice subscript opener only when it immediately follows
2537 // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
2538 // This is narrower than the old `mode == ExpectOperator` check, which
2539 // incorrectly incremented depth for block-opening braces after `sub foo`,
2540 // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
2541 // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
2542 if self.after_var_subscript {
2543 self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
2544 }
2545 self.after_var_subscript = false;
2546 self.mode = LexerMode::ExpectTerm;
2547 Some(Token {
2548 token_type: TokenType::LeftBrace,
2549 text: Arc::from("{"),
2550 start,
2551 end: self.position,
2552 })
2553 }
2554 '}' => {
2555 self.advance();
2556 self.after_arrow = false;
2557 // Decrement hash subscript brace depth only if we were inside one.
2558 // If depth > 0, this closes a hash subscript; enable chained subscripts
2559 // like $h{a}{b} by setting after_var_subscript so the next `{` is
2560 // recognized as another subscript opener.
2561 if self.hash_brace_depth > 0 {
2562 self.hash_brace_depth -= 1;
2563 // The subscript value is now the "variable" for a chained subscript.
2564 self.after_var_subscript = true;
2565 } else {
2566 // Block-close `}` — no subscript follows
2567 self.after_var_subscript = false;
2568 }
2569 self.mode = LexerMode::ExpectOperator;
2570 Some(Token {
2571 token_type: TokenType::RightBrace,
2572 text: Arc::from("}"),
2573 start,
2574 end: self.position,
2575 })
2576 }
2577 '#' => {
2578 // Only treat as delimiter in ExpectDelimiter mode
2579 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2580 self.advance();
2581 // Reset mode after consuming delimiter
2582 self.mode = LexerMode::ExpectTerm;
2583 Some(Token {
2584 token_type: TokenType::Operator(Arc::from("#")),
2585 text: Arc::from("#"),
2586 start,
2587 end: self.position,
2588 })
2589 } else {
2590 None
2591 }
2592 }
2593 _ => None,
2594 }
2595 }
2596
    /// Lex a `"..."` double-quoted string starting at the opening quote.
    ///
    /// Accumulates plain text into `current_literal` and, when interpolation
    /// is enabled (`config.parse_interpolation`), splits the string into
    /// `StringPart`s: `Literal`, `Variable` (`$name`), `Expression` (`${...}`
    /// or `$name{...}`), `ArraySlice` (`$name[...]`), and `MethodCall`
    /// (`$name->...`). Returns a plain `StringLiteral` token when no
    /// interpolation parts were found, otherwise `InterpolatedString`.
    /// On EOF before the closing quote, consumes to EOF and returns an
    /// `Error` token via `unterminated_string_error`.
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                // Closing quote: flush any pending literal text and emit.
                '"' => {
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // Escape: keep the backslash and the escaped char verbatim in
                // the literal; interpretation is deferred to later phases.
                '\\' => {
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                // Interpolation start — only when the config enables it;
                // otherwise `$` falls through to the default literal arm.
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    let part_start = self.position;
                    self.advance();
                    match self.current_char() {
                        // `${ ... }` — arbitrary expression block.
                        Some('{') => {
                            let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                            parts.push(StringPart::Expression(Arc::from(
                                &self.input[part_start..self.position],
                            )));
                        }
                        // `$name` — scan the identifier, then look for a
                        // subscript or `->` chain after it.
                        Some(ch) if is_perl_identifier_start(ch) => {
                            let var_start = self.position;

                            // Fast path for ASCII identifier continuation
                            while self.position < self.input_bytes.len() {
                                let byte = self.input_bytes[self.position];
                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                    self.position += 1;
                                } else if byte >= 128 {
                                    // Only use UTF-8 parsing for non-ASCII
                                    if let Some(ch) = self.current_char() {
                                        if is_perl_identifier_continue(ch) {
                                            self.advance();
                                        } else {
                                            break;
                                        }
                                    } else {
                                        break;
                                    }
                                } else {
                                    break;
                                }
                            }

                            if self.position > var_start {
                                let var_name = &self.input[part_start..self.position];
                                parts.push(StringPart::Variable(Arc::from(var_name)));

                                // `$name->...` — dereference / method-call tail.
                                if self.matches_bytes(b"->") {
                                    let tail_start = self.position;
                                    self.advance();
                                    self.advance();

                                    match self.current_char() {
                                        // `->[...]` array-element deref.
                                        Some('[') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('[', ']', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // `->{...}` hash-element deref.
                                        Some('{') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('{', '}', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // `->(...)` code deref / call.
                                        Some('(') => {
                                            let _ = self
                                                .consume_balanced_segment_in_string('(', ')', '"');
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // `->method` or `->method(...)`.
                                        Some(ch) if is_perl_identifier_start(ch) => {
                                            while self.position < self.input_bytes.len() {
                                                let byte = self.input_bytes[self.position];
                                                if byte.is_ascii_alphanumeric() || byte == b'_' {
                                                    self.position += 1;
                                                } else if byte >= 128 {
                                                    if let Some(ch) = self.current_char() {
                                                        if is_perl_identifier_continue(ch) {
                                                            self.advance();
                                                        } else {
                                                            break;
                                                        }
                                                    } else {
                                                        break;
                                                    }
                                                } else {
                                                    break;
                                                }
                                            }
                                            if self.current_char() == Some('(') {
                                                let _ = self.consume_balanced_segment_in_string(
                                                    '(', ')', '"',
                                                );
                                            }
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                        // Bare `->` with nothing recognizable after
                                        // it — record the arrow itself as the tail.
                                        _ => {
                                            parts.push(StringPart::MethodCall(Arc::from(
                                                &self.input[tail_start..self.position],
                                            )));
                                        }
                                    }
                                } else if self.current_char() == Some('[') {
                                    // `$name[...]` array element / slice.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('[', ']', '"');
                                    parts.push(StringPart::ArraySlice(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                } else if self.current_char() == Some('{') {
                                    // `$name{...}` hash element.
                                    let tail_start = self.position;
                                    let _ = self.consume_balanced_segment_in_string('{', '}', '"');
                                    parts.push(StringPart::Expression(Arc::from(
                                        &self.input[tail_start..self.position],
                                    )));
                                }
                            }
                        }
                        _ => {
                            // NOTE(review): a `$` not followed by `{` or an
                            // identifier start is consumed here but appended to
                            // neither the literal nor the parts, so it is absent
                            // from the parts list (the token `text` still holds
                            // the full source slice) — confirm this is intended.
                        }
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress (defends against any
            // path that fails to advance; breaking yields the unterminated error).
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        Some(self.unterminated_string_error(start))
    }
2779
2780 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2781 self.advance(); // Skip opening quote
2782
2783 let mut last_pos = self.position;
2784
2785 while let Some(ch) = self.current_char() {
2786 match ch {
2787 '\'' => {
2788 self.advance();
2789 let text = &self.input[start..self.position];
2790 self.mode = LexerMode::ExpectOperator;
2791
2792 return Some(Token {
2793 token_type: TokenType::StringLiteral,
2794 text: Arc::from(text),
2795 start,
2796 end: self.position,
2797 });
2798 }
2799 '\\' => {
2800 self.advance();
2801 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2802 self.advance();
2803 }
2804 }
2805 _ => self.advance(),
2806 }
2807
2808 // Safety check: ensure we're making progress
2809 if self.position == last_pos {
2810 break;
2811 }
2812 last_pos = self.position;
2813 }
2814
2815 Some(self.unterminated_string_error(start))
2816 }
2817
2818 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2819 self.advance(); // Skip opening backtick
2820
2821 let mut last_pos = self.position;
2822
2823 while let Some(ch) = self.current_char() {
2824 match ch {
2825 '`' => {
2826 self.advance();
2827 let text = &self.input[start..self.position];
2828 self.mode = LexerMode::ExpectOperator;
2829
2830 return Some(Token {
2831 token_type: TokenType::QuoteCommand,
2832 text: Arc::from(text),
2833 start,
2834 end: self.position,
2835 });
2836 }
2837 '\\' => {
2838 self.advance();
2839 if self.current_char().is_some() {
2840 self.advance();
2841 }
2842 }
2843 _ => self.advance(),
2844 }
2845
2846 // Safety check: ensure we're making progress
2847 if self.position == last_pos {
2848 break;
2849 }
2850 last_pos = self.position;
2851 }
2852
2853 Some(self.unterminated_string_error(start))
2854 }
2855
    /// Placeholder for dedicated `q`-string parsing.
    ///
    /// Currently a stub that always returns `None`, so callers fall through to
    /// other parsing paths; the `_start` offset is unused.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
2860
2861 #[inline]
2862 fn unterminated_string_error(&mut self, start: usize) -> Token {
2863 // Consume to EOF so the caller receives a single terminal error token.
2864 let end = self.input.len();
2865 self.position = end;
2866
2867 Token {
2868 token_type: TokenType::Error(Arc::from("unterminated string")),
2869 text: Arc::from(&self.input[start..end]),
2870 start,
2871 end,
2872 }
2873 }
2874
2875 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
2876 // We've already consumed 's'
2877 let delimiter = self.current_char()?;
2878 self.advance(); // Skip delimiter
2879 self.parse_substitution_with_delimiter(start, delimiter)
2880 }
2881
2882 fn parse_substitution_with_delimiter(
2883 &mut self,
2884 start: usize,
2885 delimiter: char,
2886 ) -> Option<Token> {
2887 let (_pattern, pattern_closed) = self.read_delimited_body(delimiter);
2888 let replacement_closed;
2889
2890 let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
2891 if pattern_is_paired {
2892 self.skip_paired_substitution_replacement_gap();
2893
2894 if let Some(repl_delim) = self.current_char()
2895 && Self::is_quote_delim(repl_delim)
2896 {
2897 self.advance();
2898 let (_replacement, closed) = self.read_substitution_replacement_body(repl_delim);
2899 replacement_closed = closed;
2900 } else {
2901 replacement_closed = false;
2902 }
2903 } else {
2904 let (_replacement, closed) = self.read_substitution_replacement_body(delimiter);
2905 replacement_closed = closed;
2906 }
2907
2908 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2909 while let Some(ch) = self.current_char() {
2910 if ch.is_ascii_alphanumeric() {
2911 self.advance();
2912 } else {
2913 break;
2914 }
2915 }
2916
2917 let text = &self.input[start..self.position];
2918 self.mode = LexerMode::ExpectOperator;
2919
2920 let token_type = if pattern_closed && replacement_closed {
2921 TokenType::Substitution
2922 } else {
2923 TokenType::Error(Arc::from(format!(
2924 "unclosed quote-like operator 's' delimiter '{}'",
2925 delimiter
2926 )))
2927 };
2928
2929 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2930 }
2931
2932 fn skip_paired_substitution_replacement_gap(&mut self) {
2933 let mut comment_eligible = false;
2934 loop {
2935 let mut saw_whitespace = false;
2936 while self.current_char().is_some_and(char::is_whitespace) {
2937 self.advance();
2938 saw_whitespace = true;
2939 }
2940 comment_eligible |= saw_whitespace;
2941
2942 if comment_eligible && self.current_char() == Some('#') {
2943 while let Some(ch) = self.current_char() {
2944 self.advance();
2945 if matches!(ch, '\n' | '\r') {
2946 break;
2947 }
2948 }
2949 comment_eligible = true;
2950 continue;
2951 }
2952
2953 break;
2954 }
2955 }
2956
2957 fn read_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2958 if quote_handler::paired_close(delim).is_some() {
2959 return self.read_delimited_body(delim);
2960 }
2961
2962 self.read_unpaired_substitution_replacement_body(delim)
2963 }
2964
2965 fn read_unpaired_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2966 let mut body = String::new();
2967 let mut escaped = false;
2968
2969 while let Some(ch) = self.current_char() {
2970 if escaped {
2971 body.push(ch);
2972 self.advance();
2973 escaped = false;
2974 continue;
2975 }
2976
2977 match ch {
2978 '\\' => {
2979 body.push(ch);
2980 self.advance();
2981 escaped = true;
2982 }
2983 '"' | '\'' if ch != delim => {
2984 if let Some((string_end, true)) =
2985 self.scan_inner_string_for_delimiter(self.position, ch, delim)
2986 {
2987 if let Some(string_text) = self.input.get(self.position..string_end) {
2988 body.push_str(string_text);
2989 self.position = string_end;
2990 } else {
2991 body.push(ch);
2992 self.advance();
2993 }
2994 } else {
2995 body.push(ch);
2996 self.advance();
2997 }
2998 }
2999 c if c == delim => {
3000 self.advance();
3001 return (body, true);
3002 }
3003 _ => {
3004 body.push(ch);
3005 self.advance();
3006 }
3007 }
3008 }
3009
3010 (body, false)
3011 }
3012
3013 fn scan_inner_string_for_delimiter(
3014 &self,
3015 start: usize,
3016 quote: char,
3017 delim: char,
3018 ) -> Option<(usize, bool)> {
3019 let mut pos = start.checked_add(quote.len_utf8())?;
3020 let mut escaped = false;
3021 let mut contains_delim = false;
3022
3023 while let Some(ch) = self.input.get(pos..).and_then(|text| text.chars().next()) {
3024 if matches!(ch, '\n' | '\r') {
3025 return None;
3026 }
3027
3028 if escaped {
3029 if ch == delim {
3030 contains_delim = true;
3031 }
3032 pos += ch.len_utf8();
3033 escaped = false;
3034 continue;
3035 }
3036
3037 match ch {
3038 '\\' => {
3039 pos += ch.len_utf8();
3040 escaped = true;
3041 }
3042 c if c == quote => {
3043 return Some((pos + ch.len_utf8(), contains_delim));
3044 }
3045 c if c == delim => {
3046 contains_delim = true;
3047 pos += ch.len_utf8();
3048 }
3049 _ => {
3050 pos += ch.len_utf8();
3051 }
3052 }
3053 }
3054
3055 None
3056 }
3057
3058 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
3059 // We've already consumed 'tr' or 'y'
3060 while self.current_char().is_some_and(char::is_whitespace) {
3061 self.advance();
3062 }
3063
3064 let delimiter = self.current_char()?;
3065 self.advance(); // Skip delimiter
3066 self.parse_transliteration_with_delimiter(start, delimiter)
3067 }
3068
3069 fn parse_transliteration_with_delimiter(
3070 &mut self,
3071 start: usize,
3072 delimiter: char,
3073 ) -> Option<Token> {
3074 let (_search, search_closed) = self.read_delimited_body(delimiter);
3075 let replacement_closed;
3076
3077 let search_is_paired = quote_handler::paired_close(delimiter).is_some();
3078 if search_is_paired {
3079 while self.current_char().is_some_and(char::is_whitespace) {
3080 self.advance();
3081 }
3082
3083 if let Some(repl_delim) = self.current_char()
3084 && Self::is_quote_delim(repl_delim)
3085 {
3086 self.advance();
3087 let (_replacement, closed) = self.read_delimited_body(repl_delim);
3088 replacement_closed = closed;
3089 } else {
3090 replacement_closed = false;
3091 }
3092 } else {
3093 let (_replacement, closed) = self.read_delimited_body(delimiter);
3094 replacement_closed = closed;
3095 }
3096
3097 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
3098 while let Some(ch) = self.current_char() {
3099 if ch.is_ascii_alphanumeric() {
3100 self.advance();
3101 } else {
3102 break;
3103 }
3104 }
3105
3106 let text = &self.input[start..self.position];
3107 self.mode = LexerMode::ExpectOperator;
3108
3109 let token_type = if search_closed && replacement_closed {
3110 TokenType::Transliteration
3111 } else {
3112 TokenType::Error(Arc::from(format!(
3113 "unclosed quote-like operator '{}' delimiter '{}'",
3114 if self.input[start..].starts_with("tr") { "tr" } else { "y" },
3115 delimiter
3116 )))
3117 };
3118
3119 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3120 }
3121
3122 /// Read content between delimiters.
3123 ///
3124 /// Returns `(body, closed)` where `closed` is `true` if the closing
3125 /// delimiter was found before EOF, and `false` if EOF was reached first.
3126 fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
3127 let paired = quote_handler::paired_close(delim);
3128 let close = paired.unwrap_or(delim);
3129 let mut body = String::new();
3130 let mut depth = i32::from(paired.is_some());
3131
3132 while let Some(ch) = self.current_char() {
3133 if ch == '\\' {
3134 body.push(ch);
3135 self.advance();
3136 if let Some(next) = self.current_char() {
3137 body.push(next);
3138 self.advance();
3139 }
3140 continue;
3141 }
3142
3143 if paired.is_some() && ch == delim {
3144 body.push(ch);
3145 self.advance();
3146 depth += 1;
3147 continue;
3148 }
3149
3150 if ch == close {
3151 if paired.is_some() {
3152 depth -= 1;
3153 if depth == 0 {
3154 self.advance();
3155 return (body, true);
3156 }
3157 body.push(ch);
3158 self.advance();
3159 } else {
3160 self.advance();
3161 return (body, true);
3162 }
3163 continue;
3164 }
3165
3166 body.push(ch);
3167 self.advance();
3168 }
3169
3170 // EOF reached without finding the closing delimiter
3171 (body, false)
3172 }
3173
3174 /// Parse a quote operator after we've seen the delimiter
3175 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3176 let info = self.current_quote_op.as_ref()?;
3177 let start = info.start_pos;
3178 let operator = info.operator.clone();
3179
3180 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3181 // below) does not leave a stale reference behind. The post-match cleanup at the
3182 // bottom of this function would otherwise be skipped for those operators.
3183 self.current_quote_op = None;
3184
3185 // Parse based on operator type; track whether all delimiters were closed.
3186 let closed = match operator.as_str() {
3187 "s" => {
3188 return self.parse_substitution_with_delimiter(start, delimiter);
3189 }
3190 "tr" | "y" => {
3191 return self.parse_transliteration_with_delimiter(start, delimiter);
3192 }
3193 "qr" => {
3194 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3195 self.parse_regex_modifiers("e_handler::QR_SPEC);
3196 body_closed
3197 }
3198 "m" => {
3199 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3200 self.parse_regex_modifiers("e_handler::M_SPEC);
3201 body_closed
3202 }
3203 _ => {
3204 // q, qq, qw, qx - no modifiers
3205 let (_body, body_closed) = self.read_delimited_body(delimiter);
3206 body_closed
3207 }
3208 };
3209
3210 let text = &self.input[start..self.position];
3211
3212 self.mode = LexerMode::ExpectOperator;
3213
3214 if !closed {
3215 // EOF reached before finding the closing delimiter — emit an error
3216 // token so the parser's recovery mechanism records a diagnostic.
3217 return Some(Token {
3218 token_type: TokenType::Error(Arc::from(format!(
3219 "unclosed {} delimiter '{}'",
3220 operator, delimiter
3221 ))),
3222 text: Arc::from(text),
3223 start,
3224 end: self.position,
3225 });
3226 }
3227
3228 let token_type = quote_handler::get_quote_token_type(&operator);
3229 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3230 }
3231
3232 /// Parse regex modifiers according to the given spec
3233 ///
3234 /// This function includes ALL characters that could be intended as modifiers,
3235 /// including invalid ones. This allows the parser to properly reject invalid
3236 /// modifiers with a clear error message, rather than leaving them as separate
3237 /// tokens that could be confusingly parsed.
3238 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3239 // Consume all alphanumeric characters that could be intended as modifiers
3240 // The parser will validate and reject invalid ones
3241 while let Some(ch) = self.current_char() {
3242 if ch.is_ascii_alphanumeric() {
3243 self.advance();
3244 } else {
3245 break;
3246 }
3247 }
3248 // Note: We no longer validate here - the parser will validate and provide
3249 // clear error messages for invalid modifiers (MUT_005 fix)
3250 }
3251
    /// Parse a regex literal starting with `/`
    ///
    /// **Budget Protection (Issue #422)**:
    /// - Budget guards prevent runaway scanning on pathological input
    /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
    /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
    /// - Graceful degradation: emit UnknownRest token if budget exceeded
    ///
    /// **Performance**:
    /// - Single-pass scanning with escape handling
    /// - Budget check per iteration (amortized O(1) via inline fast path)
    /// - Typical regex: <10μs, Large regex (64KB): ~1ms
    fn parse_regex(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening /

        let mut regex_parse_steps: usize = 0;
        // Tracks `[...]` so a `/` inside a character class does not terminate
        // the literal (e.g. `/[a/b]/` is one regex).
        let mut in_character_class = false;

        while let Some(ch) = self.current_char() {
            regex_parse_steps += 1;
            if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
                #[cfg(debug_assertions)]
                {
                    let text = &self.input[start..self.position];
                    let preview = truncate_preview(text, 50);
                    tracing::debug!(
                        limit = MAX_REGEX_PARSE_STEPS,
                        pattern_preview = %preview,
                        "Regex parse step budget exceeded"
                    );
                }
                // Give up gracefully: jump to EOF and hand the remainder to
                // the parser as a single opaque token.
                self.position = self.input.len();
                return Some(Token {
                    token_type: TokenType::UnknownRest,
                    text: empty_arc(),
                    start,
                    end: self.position,
                });
            }

            // Budget guard: prevent timeout on pathological input (Issue #422)
            // If exceeded, returns UnknownRest token for graceful degradation
            if let Some(token) = self.budget_guard(start, 0) {
                return Some(token);
            }

            match ch {
                '/' if !in_character_class => {
                    self.advance();
                    // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
                    while let Some(ch) = self.current_char() {
                        if ch.is_ascii_alphanumeric() {
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    // A completed regex is a term; an operator may follow.
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::RegexMatch,
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Handle escape sequences: consume backslash + next char
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                '[' => {
                    in_character_class = true;
                    self.advance();
                }
                ']' if in_character_class => {
                    in_character_class = false;
                    self.advance();
                }
                _ => self.advance(),
            }
        }

        // Unterminated regex - EOF reached before closing /
        // Parser will emit diagnostic for unterminated literal
        None
    }
3343}
3344
// Shared empty `Arc<str>`: initialized once, then every call is just a
// refcount bump instead of a fresh allocation.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return a cheaply-cloneable empty `Arc<str>`.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
3352
/// Truncate `text` to at most `max_chars` characters for log previews,
/// appending `...` when anything was cut. Operates on `char` boundaries,
/// so multi-byte UTF-8 input is never split mid-character.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    if let Some((cut, _)) = text.char_indices().nth(max_chars) {
        // `cut` is the byte offset of the first character past the limit.
        let mut preview = String::with_capacity(cut + 3);
        preview.push_str(&text[..cut]);
        preview.push_str("...");
        preview
    } else {
        // Short enough already — return it unchanged.
        text.to_string()
    }
}
3359
3360#[inline(always)]
3361fn is_keyword_fast(word: &str) -> bool {
3362 // Fast length-based rejection for most cases.
3363 // Lexer keywords are currently bounded to 1..=9 characters.
3364 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
3365}
3366
/// True if `word` is a Perl builtin that may appear as a bare term.
///
/// O(log n) lookup; relies on `BARE_TERM_BUILTINS` being kept sorted.
#[inline]
fn is_builtin_function(word: &str) -> bool {
    BARE_TERM_BUILTINS.binary_search(&word).is_ok()
}
3371
/// True if `word` names one of the quote-like operators the lexer recognizes.
#[inline(always)]
fn is_quote_op_word_prefix(word: &[u8]) -> bool {
    // Complete set of quote-op names: match, quote, double-quote, word list,
    // command execution, and regex-quote.
    const QUOTE_OPS: [&[u8]; 6] = [b"m", b"q", b"qq", b"qw", b"qx", b"qr"];
    QUOTE_OPS.contains(&word)
}
3376
/// Builtins that may start a bare term (no parentheses required).
///
/// INVARIANT: this list MUST remain sorted — `is_builtin_function` looks
/// entries up with `binary_search`. Keep new entries in order.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex", "int",
    "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print", "push",
    "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt", "substr",
    "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
3383
/// Fast lookup table for compound operator second characters.
///
/// FIX: `/` was missing, which made the `(b'/', b'/')` arm below unreachable —
/// the ASCII fast path rejected Perl's defined-or operator `//` even though
/// the non-ASCII fallback accepted it. The table now agrees with the match.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+-.~*:/";

/// True if `first` followed by `second` forms a two-character Perl operator
/// (e.g. `==`, `=~`, `->`, `//`, `::`).
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    // Optimized compound operator lookup using perfect hashing for common cases
    // Convert to bytes for faster comparison (most operators are ASCII)
    if first.is_ascii() && second.is_ascii() {
        let first_byte = first as u8;
        let second_byte = second as u8;

        // Reject quickly when the second byte can never complete an operator.
        if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
            return false;
        }

        // Use lookup table approach for maximum performance
        match (first_byte, second_byte) {
            // Assignment operators
            (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,

            // Comparison operators
            (b'<' | b'>' | b'=' | b'!', b'=') => true,

            // Pattern operators
            (b'=' | b'!', b'~') => true,

            // Increment/decrement
            (b'+', b'+') | (b'-', b'-') => true,

            // Logical operators
            (b'&', b'&') | (b'|', b'|') => true,

            // Shift operators
            (b'<', b'<') | (b'>', b'>') => true,

            // Other compound operators
            (b'*', b'*')
            | (b'/', b'/')
            | (b'-' | b'=', b'>')
            | (b'.', b'.')
            | (b'~', b'~')
            | (b':', b':') => true,

            _ => false,
        }
    } else {
        // Fallback for non-ASCII (should be rare)
        matches!(
            (first, second),
            ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
                | ('=' | '!' | '~', '~')
                | ('+', '+')
                | ('-', '-' | '>')
                | ('&', '&')
                | ('|', '|')
                | ('<', '<')
                | ('>' | '=', '>')
                | ('*', '*')
                | ('/', '/')
                | ('.', '.')
                | (':', ':')
        )
    }
}
3448
3449// Checkpoint support for incremental parsing
3450
3451mod checkpoint_impl;
3452
3453#[cfg(test)]
3454mod test_format_debug;
3455#[cfg(test)]
3456mod tests;