perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
//! - **Heredoc Markers**: `<<` can be the left-shift operator or the start of a here-doc
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//! - **MAX_REGEX_PARSE_STEPS**: 32K maximum scan iterations for regex literals
98//!
99//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
100//! all previously parsed symbols, allowing continued analysis.
101//!
102//! # Integration with perl-parser
103//!
104//! The lexer is designed to work seamlessly with `perl_parser_core::Parser`.
105//! You rarely need to use the lexer directly -- the parser creates and manages
106//! a `PerlLexer` instance internally:
107//!
108//! ```rust,ignore
109//! use perl_parser_core::Parser;
110//!
111//! let code = r#"sub hello { print "Hello, world!\n"; }"#;
112//! let mut parser = Parser::new(code);
113//! let ast = parser.parse().expect("should parse");
114//! ```
115
116#![allow(
117 // Core allows for lexer code
118 clippy::too_many_lines,
119 clippy::module_name_repetitions,
120 clippy::cast_possible_truncation,
121 clippy::cast_sign_loss,
122 clippy::cast_possible_wrap,
123 clippy::cast_precision_loss,
124 clippy::must_use_candidate,
125 clippy::missing_errors_doc,
126 clippy::missing_panics_doc,
127
128 // Lexer-specific patterns that are fine
129 clippy::match_same_arms,
130 clippy::redundant_else,
131 clippy::unnecessary_wraps,
132 clippy::unused_self,
133 clippy::items_after_statements,
134 clippy::struct_excessive_bools,
135 clippy::uninlined_format_args
136)]
137
138use std::sync::Arc;
139
140pub mod api;
141pub mod builtins;
142pub mod checkpoint;
143pub mod config;
144pub mod error;
145mod heredoc;
146pub mod keywords;
147mod lexer;
148pub mod limits;
149pub mod mode;
150mod quote_handler;
151pub mod token;
152pub mod tokenizer;
153mod unicode;
154
155pub use api::*;
156pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
157pub use config::LexerConfig;
158pub use error::{LexerError, Result};
159pub use lexer::PerlLexer;
160pub use limits::MAX_REGEX_PARSE_STEPS;
161pub use mode::LexerMode;
162pub use perl_position_tracking::Position;
163pub use token::{StringPart, Token, TokenType};
164
165use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
166
167use crate::heredoc::HeredocSpec;
168use crate::lexer::helpers::{
169 empty_arc, is_builtin_function, is_compound_operator, is_keyword_fast, is_quote_op_word_prefix,
170 truncate_preview,
171};
172use crate::limits::{
173 HEREDOC_TIMEOUT_MS, MAX_DELIM_NEST, MAX_HEREDOC_BYTES, MAX_HEREDOC_DEPTH, MAX_REGEX_BYTES,
174};
175
176impl<'a> PerlLexer<'a> {
177 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
178 pub fn with_body_tokens(input: &'a str) -> Self {
179 let mut lexer = Self::new(input);
180 lexer.emit_heredoc_body_tokens = true;
181 lexer
182 }
183
184 /// Set the lexer mode (for resetting state at statement boundaries)
185 pub fn set_mode(&mut self, mode: LexerMode) {
186 self.mode = mode;
187 }
188
189 /// Advance the lexer and return the next token.
190 ///
191 /// Returns `None` only after an `EOF` token has already been emitted.
192 /// The final meaningful call returns `Some(Token { token_type: TokenType::EOF, .. })`.
193 pub fn next_token(&mut self) -> Option<Token> {
194 // Normalize file start (BOM) once
195 if self.position == 0 {
196 self.normalize_file_start();
197 }
198 self.normalize_char_boundary();
199
200 // Loop to avoid recursion when processing heredocs
201 loop {
202 // Handle format body parsing if we're in that mode
203 if matches!(self.mode, LexerMode::InFormatBody) {
204 return self.parse_format_body();
205 }
206
207 // Handle data section parsing if we're in that mode
208 if matches!(self.mode, LexerMode::InDataSection) {
209 return self.parse_data_body();
210 }
211
212 // Check if we're inside a heredoc body BEFORE skipping whitespace
213 let mut found_terminator = false;
214 if !self.pending_heredocs.is_empty() {
215 // Clone what we need to avoid holding a borrow
216 let (body_start, label, allow_indent) =
217 if let Some(spec) = self.pending_heredocs.first() {
218 if spec.body_start > 0
219 && self.position >= spec.body_start
220 && self.position < self.input.len()
221 {
222 (spec.body_start, spec.label.clone(), spec.allow_indent)
223 } else {
224 // Not in a heredoc body yet or at EOF
225 (0, empty_arc(), false)
226 }
227 } else {
228 (0, empty_arc(), false)
229 };
230
231 if body_start > 0 {
232 // We're inside a heredoc body - scan for the terminator
233
234 // Scan line by line looking for the terminator
235 while self.position < self.input.len() {
236 // Timeout protection (Issue #443)
237 if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
238 self.pending_heredocs.remove(0);
239 self.position = self.input.len();
240 return Some(Token {
241 token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
242 text: Arc::from(&self.input[body_start..]),
243 start: body_start,
244 end: self.input.len(),
245 });
246 }
247
248 // Budget cap for huge bodies - optimized check
249 if self.position - body_start > MAX_HEREDOC_BYTES {
250 // Remove the pending heredoc to avoid infinite loop
251 self.pending_heredocs.remove(0);
252 self.position = self.input.len();
253 return Some(Token {
254 token_type: TokenType::UnknownRest,
255 text: Arc::from(&self.input[body_start..]),
256 start: body_start,
257 end: self.input.len(),
258 });
259 }
260
261 // Skip to start of next line if not at line start
262 // Exception: if we're at body_start exactly, we're at the heredoc body start
263 if !self.after_newline && self.position != body_start {
264 while self.position < self.input.len()
265 && self.input_bytes[self.position] != b'\n'
266 && self.input_bytes[self.position] != b'\r'
267 {
268 self.advance();
269 }
270 self.consume_newline();
271 continue;
272 }
273
274 // We're at line start - check if this line is the terminator
275 let line_start = self.position;
276 let line_end = Self::find_line_end(self.input_bytes, self.position);
277 let line = &self.input[line_start..line_end];
278 // Strip trailing spaces/tabs (Perl allows them)
279 let trimmed_end = line.trim_end_matches([' ', '\t']);
280
281 // Check if this line is the terminator
282 let is_terminator = if allow_indent {
283 // Allow any leading spaces/tabs before the label
284 let mut p = 0;
285 while p < trimmed_end.len() {
286 let b = trimmed_end.as_bytes()[p];
287 if b == b' ' || b == b'\t' {
288 p += 1;
289 } else {
290 break;
291 }
292 }
293 trimmed_end[p..] == *label
294 } else {
295 // Must start at column 0 (no leading whitespace)
296 // The terminator is just the label (already trimmed trailing whitespace)
297 trimmed_end == &*label
298 };
299
300 if is_terminator {
301 // Found the terminator!
302 self.pending_heredocs.remove(0);
303 found_terminator = true;
304
305 // Consume past the terminator line
306 self.position = line_end;
307 self.consume_newline();
308
309 // Set body_start for the next pending heredoc (if any)
310 if let Some(next) = self.pending_heredocs.first_mut()
311 && next.body_start == 0
312 {
313 next.body_start = self.position;
314 }
315
316 // Only emit HeredocBody if requested (for folding)
317 if self.emit_heredoc_body_tokens {
318 return Some(Token {
319 token_type: TokenType::HeredocBody(empty_arc()),
320 text: empty_arc(),
321 start: body_start,
322 end: line_start,
323 });
324 }
325 // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
326 break; // Break inner while loop, continue outer loop
327 }
328
329 // Not the terminator, continue to next line
330 self.position = line_end;
331 self.consume_newline();
332 }
333
334 // If we didn't find a terminator, we reached EOF - emit error token
335 if !found_terminator {
336 // Remove the pending heredoc to avoid infinite loop
337 self.pending_heredocs.remove(0);
338 self.position = self.input.len();
339 return Some(Token {
340 token_type: TokenType::UnknownRest,
341 text: Arc::from(&self.input[body_start..]),
342 start: body_start,
343 end: self.input.len(),
344 });
345 }
346 }
347
348 // If we found a terminator, continue outer loop to get next token
349 if found_terminator {
350 continue; // Continue outer loop to get next token
351 }
352 }
353
354 self.skip_whitespace_and_comments()?;
355
356 // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
357 if !self.pending_heredocs.is_empty()
358 && let Some(spec) = self.pending_heredocs.first()
359 && spec.body_start > 0
360 && self.position >= spec.body_start
361 && self.position < self.input.len()
362 {
363 continue; // Go back to top of loop to process heredoc
364 }
365
366 // If we reach EOF with pending heredocs, clear them and emit EOF
367 if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
368 self.pending_heredocs.clear();
369 }
370
371 if self.position >= self.input.len() {
372 if self.eof_emitted {
373 return None; // Stop the stream
374 }
375 self.eof_emitted = true;
376 return Some(Token {
377 token_type: TokenType::EOF,
378 text: empty_arc(),
379 start: self.position,
380 end: self.position,
381 });
382 }
383
384 let start = self.position;
385
386 // Check for special tokens first
387 if let Some(token) = self.try_heredoc() {
388 return Some(token);
389 }
390
391 if let Some(token) = self.try_string() {
392 return Some(token);
393 }
394
395 if let Some(token) = self.try_variable() {
396 return Some(token);
397 }
398
399 if let Some(token) = self.try_number() {
400 return Some(token);
401 }
402
403 if let Some(token) = self.try_vstring() {
404 return Some(token);
405 }
406
407 if let Some(token) = self.try_identifier_or_keyword() {
408 return Some(token);
409 }
410
411 // If we're expecting a delimiter for a quote operator, only try delimiter
412 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
413 if let Some(token) = self.try_delimiter() {
414 return Some(token);
415 }
416 // Do NOT fall through to try_operator / try_punct / etc.
417 // Clear state first so we don't spin
418 self.mode = LexerMode::ExpectOperator;
419 self.current_quote_op = None;
420 continue;
421 }
422
423 if let Some(token) = self.try_operator() {
424 return Some(token);
425 }
426
427 if let Some(token) = self.try_delimiter() {
428 return Some(token);
429 }
430
431 // If nothing else matches, return an error token
432 let ch = self.current_char()?;
433 self.advance();
434
435 // Optimize error token creation - avoid expensive formatting in hot path
436 let text = if ch.is_ascii() {
437 // Fast path for ASCII characters
438 Arc::from(&self.input[start..self.position])
439 } else {
440 // Unicode path without intermediate heap allocation
441 let mut buf = [0_u8; 4];
442 Arc::from(ch.encode_utf8(&mut buf))
443 };
444
445 return Some(Token {
446 token_type: TokenType::Error(Arc::from("Unexpected character")),
447 text,
448 start,
449 end: self.position,
450 });
451 } // End of loop
452 }
453
454 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
455 ///
456 /// **Purpose**: Protect against pathological input that could cause:
457 /// - Infinite loops in regex/heredoc parsing
458 /// - Excessive memory consumption
459 /// - LSP server hangs
460 ///
461 /// **Limits**:
462 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
463 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
464 ///
465 /// **Graceful Degradation**:
466 /// - Budget exceeded → emit `UnknownRest` token
467 /// - Jump to EOF to prevent further parsing of problematic region
468 /// - LSP client can emit soft diagnostic about truncation
469 /// - All previously parsed symbols remain valid
470 ///
471 /// **Performance**:
472 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
473 /// - Slow path: Only triggered on pathological input
474 /// - Amortized cost: O(1) per token
475 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
476 #[inline(always)]
477 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
478 // Fast path: most calls won't hit limits
479 let bytes_consumed = self.position - start;
480 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
481 return None;
482 }
483
484 // Slow path: budget exceeded - graceful degradation
485 #[cfg(debug_assertions)]
486 {
487 tracing::debug!(
488 bytes_consumed,
489 depth,
490 position = self.position,
491 "Lexer budget exceeded"
492 );
493 }
494
495 self.position = self.input.len();
496 Some(Token {
497 token_type: TokenType::UnknownRest,
498 text: Arc::from(""),
499 start,
500 end: self.position,
501 })
502 }
503
504 /// Peek at the next token without consuming it.
505 ///
506 /// Saves and restores the full lexer state so the next call to
507 /// [`next_token`](Self::next_token) returns the same token.
508 pub fn peek_token(&mut self) -> Option<Token> {
509 let saved_pos = self.position;
510 let saved_mode = self.mode;
511 let saved_delimiter_stack = self.delimiter_stack.clone();
512 let saved_prototype = self.in_prototype;
513 let saved_depth = self.prototype_depth;
514 let saved_after_sub = self.after_sub;
515 let saved_after_arrow = self.after_arrow;
516 let saved_hash_brace_depth = self.hash_brace_depth;
517 let saved_after_var_subscript = self.after_var_subscript;
518 let saved_paren_depth = self.paren_depth;
519 let saved_current_pos = self.current_pos;
520 let saved_after_newline = self.after_newline;
521 let saved_pending_heredocs = self.pending_heredocs.clone();
522 let saved_line_start_offset = self.line_start_offset;
523 let saved_current_quote_op = self.current_quote_op.clone();
524 let saved_eof_emitted = self.eof_emitted;
525 let saved_start_time = self.start_time;
526
527 let token = self.next_token();
528
529 self.position = saved_pos;
530 self.mode = saved_mode;
531 self.delimiter_stack = saved_delimiter_stack;
532 self.in_prototype = saved_prototype;
533 self.prototype_depth = saved_depth;
534 self.after_sub = saved_after_sub;
535 self.after_arrow = saved_after_arrow;
536 self.hash_brace_depth = saved_hash_brace_depth;
537 self.after_var_subscript = saved_after_var_subscript;
538 self.paren_depth = saved_paren_depth;
539 self.current_pos = saved_current_pos;
540 self.after_newline = saved_after_newline;
541 self.pending_heredocs = saved_pending_heredocs;
542 self.line_start_offset = saved_line_start_offset;
543 self.current_quote_op = saved_current_quote_op;
544 self.eof_emitted = saved_eof_emitted;
545 self.start_time = saved_start_time;
546
547 token
548 }
549
550 /// Consume all remaining tokens and return them as a vector.
551 ///
552 /// The returned vector always ends with an `EOF` token.
553 pub fn collect_tokens(&mut self) -> Vec<Token> {
554 let mut tokens = Vec::new();
555 while let Some(token) = self.next_token() {
556 if token.token_type == TokenType::EOF {
557 tokens.push(token);
558 break;
559 }
560 tokens.push(token);
561 }
562 tokens
563 }
564
565 /// Reset the lexer to the beginning of the input.
566 ///
567 /// Clears all internal state (mode, delimiter stack, heredoc queue, etc.)
568 /// so the lexer can re-tokenize the same source from scratch.
569 pub fn reset(&mut self) {
570 self.position = 0;
571 self.mode = LexerMode::ExpectTerm;
572 self.delimiter_stack.clear();
573 self.in_prototype = false;
574 self.prototype_depth = 0;
575 self.after_sub = false;
576 self.after_arrow = false;
577 self.hash_brace_depth = 0;
578 self.after_var_subscript = false;
579 self.paren_depth = 0;
580 self.current_pos = Position::start();
581 self.after_newline = true;
582 self.pending_heredocs.clear();
583 self.line_start_offset = 0;
584 self.current_quote_op = None;
585 self.eof_emitted = false;
586 self.start_time = std::time::Instant::now();
587 }
588
589 /// Switch the lexer into format-body parsing mode.
590 ///
591 /// In this mode the lexer consumes input verbatim until it encounters a
592 /// line containing only `.` (the Perl format terminator).
593 pub fn enter_format_mode(&mut self) {
594 self.mode = LexerMode::InFormatBody;
595 }
596
597 // Token-specific parsing methods
598
599 #[inline]
600 fn skip_whitespace_and_comments(&mut self) -> Option<()> {
601 // Don't reset after_newline if we're at the start of a line
602 if self.position > 0 && self.position != self.line_start_offset {
603 self.after_newline = false;
604 }
605
606 while self.position < self.input_bytes.len() {
607 let byte = Self::byte_at(self.input_bytes, self.position);
608 match byte {
609 // Fast path for ASCII whitespace - batch process
610 b' ' => {
611 // Batch skip spaces for better cache efficiency
612 let start = self.position;
613 while self.position < self.input_bytes.len()
614 && Self::byte_at(self.input_bytes, self.position) == b' '
615 {
616 self.position += 1;
617 }
618 // Continue outer loop if we processed any spaces
619 if self.position > start {
620 // Loop naturally continues to next iteration
621 }
622 }
623 b'\t' | 0x0B | 0x0C => {
624 // Batch skip horizontal tab, vertical tab, and form feed.
625 // Perl treats these as whitespace separators.
626 let start = self.position;
627 while self.position < self.input_bytes.len()
628 && matches!(
629 Self::byte_at(self.input_bytes, self.position),
630 b'\t' | 0x0B | 0x0C
631 )
632 {
633 self.position += 1;
634 }
635 if self.position > start {
636 // Loop naturally continues to next iteration
637 }
638 }
639 b'\r' | b'\n' => {
640 self.consume_newline();
641
642 // Set body_start for the FIRST pending heredoc that needs it (FIFO)
643 // Only check if we have pending heredocs to avoid unnecessary work
644 if !self.pending_heredocs.is_empty() {
645 for spec in &mut self.pending_heredocs {
646 if spec.body_start == 0 {
647 spec.body_start = self.position;
648 break; // Only set for the first unresolved heredoc
649 }
650 }
651 }
652 }
653 b'#' => {
654 // In ExpectDelimiter mode, '#' is a delimiter, not a comment
655 if matches!(self.mode, LexerMode::ExpectDelimiter) {
656 break;
657 }
658
659 // Skip line comment using memchr for fast newline search
660 self.position += 1; // Skip # directly
661
662 // Use memchr2 to find CR/LF line endings quickly (supports LF, CRLF, and CR)
663 if let Some(newline_offset) =
664 memchr::memchr2(b'\n', b'\r', &self.input_bytes[self.position..])
665 {
666 self.position += newline_offset;
667 } else {
668 // No newline found, skip to end
669 self.position = self.input_bytes.len();
670 }
671 }
672 b'=' if self.position == 0
673 || (self.position > 0
674 && matches!(self.input_bytes[self.position - 1], b'\n' | b'\r')) =>
675 {
676 // Check if this starts a POD section (=pod, =head, =over, etc.)
677 // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
678 let remaining = &self.input_bytes[self.position..];
679 if remaining.starts_with(b"=pod")
680 || remaining.starts_with(b"=head")
681 || remaining.starts_with(b"=over")
682 || remaining.starts_with(b"=item")
683 || remaining.starts_with(b"=back")
684 || remaining.starts_with(b"=begin")
685 || remaining.starts_with(b"=end")
686 || remaining.starts_with(b"=for")
687 || remaining.starts_with(b"=encoding")
688 {
689 // Scan forward for \n=cut (end of POD block)
690 let search_start = self.position;
691 let mut found_cut = false;
692 let bytes = self.input_bytes;
693 let mut i = search_start;
694 while i < bytes.len() {
695 // Look for =cut at the start of a line
696 if (i == 0 || matches!(bytes[i - 1], b'\n' | b'\r'))
697 && bytes[i..].starts_with(b"=cut")
698 {
699 i += 4; // Skip "=cut"
700 // Skip rest of the =cut line
701 while i < bytes.len() && bytes[i] != b'\n' && bytes[i] != b'\r' {
702 i += 1;
703 }
704 // Consume one line ending sequence if present
705 if i < bytes.len() && bytes[i] == b'\r' {
706 i += 1;
707 if i < bytes.len() && bytes[i] == b'\n' {
708 i += 1;
709 }
710 } else if i < bytes.len() && bytes[i] == b'\n' {
711 i += 1;
712 }
713 self.position = i;
714 found_cut = true;
715 break;
716 }
717 i += 1;
718 }
719 if !found_cut {
720 // POD extends to end of file
721 self.position = bytes.len();
722 }
723 continue;
724 }
725 // Not a POD directive - regular '=' token
726 break;
727 }
728 _ => {
729 // For non-ASCII whitespace, use char check only when needed
730 if byte >= 128
731 && let Some(ch) = self.current_char()
732 && ch.is_whitespace()
733 {
734 self.advance();
735 continue;
736 }
737 break;
738 }
739 }
740 }
741 Some(())
742 }
743
744 fn try_heredoc(&mut self) -> Option<Token> {
745 // `<<` is the left-shift operator, not a heredoc, when we are inside
746 // a parenthesized expression and have just finished a term.
747 // E.g. `(1<<index(...))` — the `1` sets ExpectOperator and paren_depth > 0,
748 // so `<<index` must be the bitshift operator, not a heredoc start.
749 //
750 // We must NOT fire the guard at statement level (paren_depth == 0) because
751 // `print $fh <<END` is valid Perl: `$fh` sets ExpectOperator but `<<END`
752 // is a heredoc. The depth check distinguishes the two cases.
753 if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
754 return None;
755 }
756
757 // Check for heredoc start
758 if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
759 return None;
760 }
761
762 let start = self.position;
763 let mut text = String::from("<<");
764 self.position += 2; // Skip <<
765
766 // Check for indented heredoc (~)
767 let allow_indent = if self.current_char() == Some('~') {
768 text.push('~');
769 self.advance();
770 true
771 } else {
772 false
773 };
774
775 // Skip whitespace
776 while let Some(ch) = self.current_char() {
777 if ch == ' ' || ch == '\t' {
778 text.push(ch);
779 self.advance();
780 } else {
781 break;
782 }
783 }
784
785 // Optional backslash disables interpolation, treat like single-quoted label
786 let backslashed = if self.current_char() == Some('\\') {
787 text.push('\\');
788 self.advance();
789 true
790 } else {
791 false
792 };
793
794 // Parse delimiter
795 let delimiter = if self.position < self.input.len() {
796 match self.current_char() {
797 Some('"') if !backslashed => self.parse_quoted_heredoc_delimiter('"', &mut text)?,
798 Some('\'') if !backslashed => {
799 self.parse_quoted_heredoc_delimiter('\'', &mut text)?
800 }
801 Some('`') if !backslashed => self.parse_quoted_heredoc_delimiter('`', &mut text)?,
802 Some(c) if is_perl_identifier_start(c) => {
803 // Bare word delimiter
804 let mut delim = String::new();
805 while self.position < self.input.len() {
806 if let Some(c) = self.current_char() {
807 if is_perl_identifier_continue(c) {
808 delim.push(c);
809 text.push(c);
810 self.advance();
811 } else {
812 break;
813 }
814 } else {
815 break;
816 }
817 }
818 delim
819 }
820 _ => {
821 // Not a valid heredoc delimiter - reset position and return None
822 // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
823 self.position = start;
824 return None;
825 }
826 }
827 } else {
828 // No delimiter found - reset position and return None
829 self.position = start;
830 return None;
831 };
832
833 // For now, return a placeholder token
834 // The actual heredoc body would be parsed later when we encounter it
835 self.mode = LexerMode::ExpectOperator;
836
837 // Recursion depth limit (Issue #443)
838 if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
839 return Some(Token {
840 token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
841 text: Arc::from(text),
842 start,
843 end: self.position,
844 });
845 }
846
847 // Queue the heredoc spec with its label
848 self.pending_heredocs.push(HeredocSpec {
849 label: Arc::from(delimiter.as_str()),
850 body_start: 0, // Will be set when we see the newline after this line
851 allow_indent,
852 });
853
854 Some(Token {
855 token_type: TokenType::HeredocStart,
856 text: Arc::from(text),
857 start,
858 end: self.position,
859 })
860 }
861
862 fn try_string(&mut self) -> Option<Token> {
863 let start = self.position;
864 let quote = self.current_char()?;
865
866 match quote {
867 '"' => self.parse_double_quoted_string(start),
868 '\'' => self.parse_single_quoted_string(start),
869 '`' => self.parse_backtick_string(start),
870 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
871 _ => None,
872 }
873 }
874
875 #[inline]
876 fn try_number(&mut self) -> Option<Token> {
877 let start = self.position;
878
879 // Fast byte check for digits - optimized bounds checking
880 let bytes = self.input_bytes;
881 if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
882 return None;
883 }
884
885 // Check for hex (0x), binary (0b), or octal (0o) prefixes
886 let mut pos = self.position;
887 if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
888 let prefix_byte = bytes[pos + 1];
889 if prefix_byte == b'x' || prefix_byte == b'X' {
890 // Hexadecimal: 0x[0-9a-fA-F_]+
891 pos += 2; // consume '0x'
892 let digit_start = pos;
893 let mut saw_digit = false;
894 while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
895 saw_digit |= bytes[pos].is_ascii_hexdigit();
896 pos += 1;
897 }
898 if pos > digit_start && saw_digit {
899 self.position = pos;
900 let text = &self.input[start..self.position];
901 self.mode = LexerMode::ExpectOperator;
902 return Some(Token {
903 token_type: TokenType::Number(Arc::from(text)),
904 text: Arc::from(text),
905 start,
906 end: self.position,
907 });
908 }
909 // No hex digits after 0x - fall through to parse '0' as decimal
910 } else if prefix_byte == b'b' || prefix_byte == b'B' {
911 // Binary: 0b[01_]+
912 pos += 2; // consume '0b'
913 let digit_start = pos;
914 let mut saw_digit = false;
915 while pos < bytes.len()
916 && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
917 {
918 saw_digit |= bytes[pos] == b'0' || bytes[pos] == b'1';
919 pos += 1;
920 }
921 if pos > digit_start && saw_digit {
922 self.position = pos;
923 let text = &self.input[start..self.position];
924 self.mode = LexerMode::ExpectOperator;
925 return Some(Token {
926 token_type: TokenType::Number(Arc::from(text)),
927 text: Arc::from(text),
928 start,
929 end: self.position,
930 });
931 }
932 // No binary digits after 0b - fall through to parse '0' as decimal
933 } else if prefix_byte == b'o' || prefix_byte == b'O' {
934 // Octal (explicit): 0o[0-7_]+
935 pos += 2; // consume '0o'
936 let digit_start = pos;
937 let mut saw_digit = false;
938 while pos < bytes.len()
939 && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
940 {
941 saw_digit |= (b'0'..=b'7').contains(&bytes[pos]);
942 pos += 1;
943 }
944 if pos > digit_start && saw_digit {
945 self.position = pos;
946 let text = &self.input[start..self.position];
947 self.mode = LexerMode::ExpectOperator;
948 return Some(Token {
949 token_type: TokenType::Number(Arc::from(text)),
950 text: Arc::from(text),
951 start,
952 end: self.position,
953 });
954 }
955 // No octal digits after 0o - fall through to parse '0' as decimal
956 }
957 }
958
959 // Consume initial digits - unrolled for better performance
960 pos = self.position;
961 while pos < bytes.len() {
962 let byte = Self::byte_at(bytes, pos);
963 if byte.is_ascii_digit() || byte == b'_' {
964 pos += 1;
965 } else {
966 break;
967 }
968 }
969 self.position = pos;
970
971 // Check for decimal point - optimized with single bounds check
972 if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
973 // Peek ahead to see what follows the dot
974 let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();
975
976 // Optimized dot consumption logic
977 let should_consume_dot = has_following_digit || {
978 pos + 1 >= bytes.len() || {
979 // Use bitwise operations for faster character classification
980 let next_byte = bytes[pos + 1];
981 // Whitespace, delimiters, operators - optimized check
982 next_byte <= b' '
983 || matches!(
984 next_byte,
985 b';' | b','
986 | b')'
987 | b'}'
988 | b']'
989 | b'+'
990 | b'-'
991 | b'*'
992 | b'/'
993 | b'%'
994 | b'='
995 | b'<'
996 | b'>'
997 | b'!'
998 | b'&'
999 | b'|'
1000 | b'^'
1001 | b'~'
1002 | b'e'
1003 | b'E'
1004 )
1005 }
1006 };
1007
1008 if should_consume_dot {
1009 pos += 1; // consume the dot
1010 // Consume fractional digits - batch processing
1011 while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
1012 pos += 1;
1013 }
1014 self.position = pos;
1015 }
1016 }
1017
1018 // Check for exponent - optimized
1019 if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
1020 let exp_start = pos;
1021 pos += 1; // consume 'e' or 'E'
1022
1023 // Check for optional sign
1024 if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
1025 pos += 1;
1026 }
1027
1028 // Must have at least one digit after exponent (underscores allowed between digits)
1029 let mut saw_digit = false;
1030 while pos < bytes.len() {
1031 let byte = bytes[pos];
1032 if byte.is_ascii_digit() {
1033 saw_digit = true;
1034 pos += 1;
1035 } else if byte == b'_' {
1036 pos += 1;
1037 } else {
1038 break;
1039 }
1040 }
1041
1042 // If no digits after exponent, backtrack
1043 if !saw_digit {
1044 pos = exp_start;
1045 }
1046
1047 self.position = pos;
1048 }
1049
1050 // Avoid string slicing for common number cases - use Arc::from directly on slice
1051 let text = &self.input[start..self.position];
1052 self.mode = LexerMode::ExpectOperator;
1053
1054 Some(Token {
1055 token_type: TokenType::Number(Arc::from(text)),
1056 text: Arc::from(text),
1057 start,
1058 end: self.position,
1059 })
1060 }
1061
1062 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1063 // We're at the dot, consume it
1064 self.advance();
1065
1066 // Parse the fractional part
1067 while self.position < self.input_bytes.len() {
1068 let byte = self.input_bytes[self.position];
1069 match byte {
1070 b'0'..=b'9' | b'_' => self.position += 1,
1071 b'e' | b'E' => {
1072 // Handle scientific notation.
1073 // Save the position of 'e'/'E' so we can backtrack here if
1074 // no digits follow the exponent marker (with or without sign).
1075 let e_pos = self.position;
1076 self.advance();
1077 if self.position < self.input_bytes.len() {
1078 let next = self.input_bytes[self.position];
1079 if next == b'+' || next == b'-' {
1080 self.advance();
1081 }
1082 }
1083 // Parse exponent digits (underscores allowed between digits)
1084 let mut saw_digit = false;
1085 while self.position < self.input_bytes.len() {
1086 let byte = self.input_bytes[self.position];
1087 if byte.is_ascii_digit() {
1088 saw_digit = true;
1089 self.position += 1;
1090 } else if byte == b'_' {
1091 self.position += 1;
1092 } else {
1093 break;
1094 }
1095 }
1096
1097 // No digits after exponent marker — backtrack to just before
1098 // 'e'/'E' so the caller sees it as a separate token.
1099 // Using e_pos (not exponent_start-1) avoids including 'e' in
1100 // the number slice when a sign character was consumed.
1101 if !saw_digit {
1102 self.position = e_pos;
1103 }
1104 break;
1105 }
1106 _ => break,
1107 }
1108 }
1109
1110 let text = &self.input[start..self.position];
1111 self.mode = LexerMode::ExpectOperator;
1112
1113 Some(Token {
1114 token_type: TokenType::Number(Arc::from(text)),
1115 text: Arc::from(text),
1116 start,
1117 end: self.position,
1118 })
1119 }
1120
/// Try to lex a sigil-prefixed token (`$`, `@`, `%`, `*`) at the current position.
///
/// Returns `None` (without consuming anything) when the current character is not one
/// of those sigils, or when the lexer is in `LexerMode::ExpectOperator` and the
/// character is `*` or `%` (those are then left for operator lexing).
///
/// Every token produced here has `TokenType::Identifier` and leaves the lexer in
/// `LexerMode::ExpectOperator`. Depending on what follows the sigil the token is:
///
/// - the bare sigil (postfix-dereference after `->`, or a dereference opener such as
///   `@{$ref}` where the `{` is left for the next token),
/// - `$#name` (optionally package-qualified),
/// - a braced form consumed as one token (`${^NAME}`, `${::{name}}`, `${name}`,
///   `*{...}`),
/// - a plain or package-qualified name (`$foo`, `@Foo::bar`),
/// - a special variable (`$^W`, punctuation variables, `@+`, `%-`, ...).
///
/// `self.after_var_subscript` is updated on most exits so that a following `{` can be
/// treated as a subscript; the postfix-dereference early return leaves it untouched.
fn try_variable(&mut self) -> Option<Token> {
    let start = self.position;
    let sigil = self.current_char()?;

    match sigil {
        '$' | '@' | '%' | '*' => {
            // In ExpectOperator mode, treat % and * as operators rather than sigils
            if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                return None;
            }
            self.advance();

            // Special case: After ->, sigils followed by { or [ should be tokenized separately
            // This is for postfix dereference like ->@*, ->%{}, ->@[]
            // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
            let check_arrow = self.position >= 3
                && self.position.saturating_sub(1) <= self.input.len()
                && self.input.is_char_boundary(self.position.saturating_sub(3))
                && self.input.is_char_boundary(self.position.saturating_sub(1));

            if check_arrow
                && {
                    // Temporarily rewind to the two bytes preceding the sigil and test
                    // them for "->"; the position is restored before continuing.
                    let saved = self.position;
                    self.position -= 3;
                    let arrow = self.matches_bytes(b"->");
                    self.position = saved;
                    arrow
                }
                && matches!(self.current_char(), Some('{' | '[' | '*'))
            {
                // Just return the sigil
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;

                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }

            // Check for $# (array length operator)
            if sigil == '$' && self.current_char() == Some('#') {
                self.advance(); // consume #
                // Now parse the array name
                while let Some(ch) = self.current_char() {
                    if is_perl_identifier_continue(ch) {
                        self.advance();
                    } else if ch == ':' && self.peek_char(1) == Some(':') {
                        // Package-qualified array name
                        self.advance();
                        self.advance();
                    } else {
                        break;
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                // $#foo is a complete variable token; a following `{` is a subscript.
                self.after_var_subscript = true;

                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }

            // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
            if self.current_char() == Some('{') {
                // Peek ahead to decide if we should consume the brace
                let next_char = self.peek_char(1);

                // Check if this is a dereference like @{$ref} or @{[...]}
                // If the next char suggests dereference, don't consume the brace.
                // For @ and % sigils, identifiers inside braces are also derefs
                // (e.g. @{Foo::Bar::baz} or %{Some::Hash}).
                let is_deref = sigil != '*'
                    && (matches!(
                        next_char,
                        Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                    ) || (matches!(sigil, '@' | '%')
                        && next_char.is_some_and(is_perl_identifier_start)));
                if is_deref {
                    // This is a dereference, don't consume the brace
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    // A standalone sigil token before `{` starts a dereference
                    // sequence (e.g. `${$ref}` / `@{$aref}` / `%{$href}`).
                    // Mark it as subscript-capable so `{` increments brace depth
                    // and the closing `}` can enable chained `{...}` subscripts.
                    // `*` never reaches this point: it is excluded by the
                    // `is_deref` guard above.
                    // NOTE(review): an earlier version of this comment also listed
                    // `&{$cref}`, but `&` is not one of the sigils matched by this
                    // function, so that case cannot be produced here.
                    self.after_var_subscript = true;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                self.advance(); // consume {

                // Handle special variables with caret
                if self.current_char() == Some('^') {
                    self.advance(); // consume ^
                    // Parse the special variable name; stops after the closing `}`
                    // or at the first character that cannot continue an identifier
                    // (in which case the token ends without a closing brace).
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance(); // consume }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                // Handle braced stash access like ${::{foo}} (the outer `{` has
                // already been consumed at this point)
                else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                    self.advance(); // consume first :
                    self.advance(); // consume second :
                    // Skip the optional inner {
                    if self.current_char() == Some('{') {
                        self.advance();
                    }
                    // Parse the name
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance();
                            if self.current_char() == Some('}') {
                                self.advance(); // consume closing } of ${...}
                            }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                // Regular braced variable like ${foo} or glob like *{$glob}
                else {
                    // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                    // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                    // EXCEPT for globs - *{$glob} should be parsed as one token
                    // Also check for empty braces or EOF - in these cases we should split the tokens
                    if sigil != '*'
                        && !self.current_char().is_some_and(is_perl_identifier_start)
                    {
                        // This is a dereference or empty/invalid brace, backtrack
                        self.position = start + 1; // Just past the sigil
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        // As in the `is_deref` branch: a sigil-only token means a
                        // dereference opener, so the following `{` is subscript-like.
                        self.after_var_subscript = true;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    // For glob access, we need to consume everything inside braces
                    if sigil == '*' {
                        // Depth starts at 1 for the `{` consumed above; nested
                        // braces are balanced until the matching `}` is found.
                        // If input ends first, the token simply runs to EOF.
                        let mut brace_depth: usize = 1;
                        while let Some(ch) = self.current_char() {
                            if ch == '{' {
                                brace_depth += 1;
                            } else if ch == '}' {
                                brace_depth = brace_depth.saturating_sub(1);
                                if brace_depth == 0 {
                                    self.advance(); // consume final }
                                    break;
                                }
                            }
                            self.advance();
                        }
                    } else {
                        // Regular variable
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
            }
            // Parse regular variable name
            else if let Some(ch) = self.current_char() {
                if is_perl_identifier_start(ch) {
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                    // Handle package-qualified segments like Foo::bar
                    while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance();
                        self.advance();
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
                // Handle $^Letter (e.g. $^W, $^O, $^X) and bare $^ (format_top_name)
                // Not inside prototypes where ^ is a literal prototype char
                else if sigil == '$' && ch == '^' && !self.in_prototype {
                    self.advance(); // consume ^
                    // $^Letter: consume the single uppercase letter
                    if let Some(letter) = self.current_char()
                        && letter.is_ascii_uppercase()
                    {
                        self.advance();
                    }
                    // bare $^ (no uppercase letter follows): format_top_name — stop here
                }
                // Handle special punctuation variables
                // Not inside prototypes where ; and , are literal prototype chars
                else if sigil == '$'
                    && !self.in_prototype
                    && matches!(
                        ch,
                        '?' | '!'
                            | '@'
                            | '&'
                            | '`'
                            | '\''
                            | '.'
                            | '/'
                            | '\\'
                            | '|'
                            | '+'
                            | '-'
                            | '['
                            | ']'
                            | '$'
                            | '~'
                            | '='
                            | '%'
                            | ','
                            | '"'
                            | ';'
                            | '>'
                            | '<'
                            | ')'
                            | '(' // $( = real group ID of this process
                    )
                {
                    self.advance(); // consume the special character
                }
                // $$ is the PID special variable, but only when it is not immediately
                // followed by an identifier-start character. $$var is scalar dereference
                // of $var, so keep the second $ for the next token.
                // NOTE(review): '$' is also listed in the punctuation branch above,
                // which is tried first whenever `in_prototype` is false. As written,
                // this branch is therefore only reached while `in_prototype` is true,
                // and outside prototypes `$$var` has its second `$` consumed by the
                // branch above. Confirm whether that is intended.
                else if sigil == '$' && ch == '$' {
                    if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                        self.advance(); // consume the second $ for bare $$ PID
                    }
                }
                // Handle special array/hash punctuation variables
                else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                    self.advance(); // consume the + or -
                }
            }

            let text = &self.input[start..self.position];
            self.mode = LexerMode::ExpectOperator;
            // A complete $foo, @foo, %foo token can be followed by a hash/slice
            // subscript `{`. Set the flag so the `{` handler knows to increment
            // hash_brace_depth. Glob tokens (*foo) are excluded: they don't take
            // hash subscripts in the same way.
            self.after_var_subscript = matches!(sigil, '$' | '@' | '%');

            Some(Token {
                token_type: TokenType::Identifier(Arc::from(text)),
                text: Arc::from(text),
                start,
                end: self.position,
            })
        }
        // Not a sigil: nothing is consumed.
        _ => None,
    }
}
1423
1424 /// Return the next non-space char and the char immediately following it (without consuming).
1425 /// Used to detect quote-operator delimiters while distinguishing `=>` (fat-arrow autoquote)
1426 /// from `=` used as a plain delimiter.
1427 fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
1428 let mut i = self.position;
1429 while i < self.input.len() {
1430 let c = match self.input.get(i..).and_then(|s| s.chars().next()) {
1431 Some(c) => c,
1432 None => return (None, None),
1433 };
1434 if c.is_whitespace() {
1435 i += c.len_utf8();
1436 continue;
1437 }
1438 // Found non-space at position i; peek the next char after it
1439 let j = i + c.len_utf8();
1440 let following = self.input.get(j..).and_then(|s| s.chars().next());
1441 return (Some(c), following);
1442 }
1443 (None, None)
1444 }
1445
/// Report whether `c` may serve as the delimiter of a quote-like operator.
///
/// Anything that is neither an ASCII letter/digit nor whitespace qualifies. That
/// includes paired delimiters, punctuation and control characters (for example
/// `s\x07pattern\x07replacement\x07`).
fn is_quote_delim(c: char) -> bool {
    !(c.is_ascii_alphanumeric() || c.is_whitespace())
}
1452
1453 #[inline]
1454 fn immediately_follows_sigil_prefix(&self, start: usize) -> bool {
1455 start > 0
1456 && matches!(
1457 Self::byte_at(self.input_bytes, start.saturating_sub(1)),
1458 b'$' | b'@' | b'%' | b'&' | b'*'
1459 )
1460 }
1461
1462 /// Try to parse a v-string (version string) like `v5.26.0` or `v5.10`.
1463 ///
1464 /// A v-string starts with `v` followed by one or more digits, then optionally
1465 /// `.` followed by digits, repeated. The `v` prefix distinguishes these from
1466 /// normal identifiers. Examples: `v5.26.0`, `v5.10`, `v1.2.3.4`.
1467 #[inline]
1468 fn try_vstring(&mut self) -> Option<Token> {
1469 let start = self.position;
1470 let bytes = self.input_bytes;
1471
1472 // Must start with 'v' followed by at least one digit
1473 if start >= bytes.len() || bytes[start] != b'v' {
1474 return None;
1475 }
1476
1477 let next_pos = start + 1;
1478 if next_pos >= bytes.len() || !bytes[next_pos].is_ascii_digit() {
1479 return None;
1480 }
1481
1482 // We have `v` followed by a digit — scan the rest of the v-string.
1483 // Pattern: v DIGITS (.DIGITS)*
1484 let mut pos = next_pos;
1485
1486 // Consume leading digits
1487 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1488 pos += 1;
1489 }
1490
1491 // Consume optional `.DIGITS` segments (require at least one digit after dot)
1492 while pos < bytes.len() && bytes[pos] == b'.' {
1493 let dot_pos = pos;
1494 pos += 1; // skip '.'
1495
1496 if pos >= bytes.len() || !bytes[pos].is_ascii_digit() {
1497 // Dot not followed by digit — not part of the v-string
1498 pos = dot_pos;
1499 break;
1500 }
1501
1502 // Consume digits after the dot
1503 while pos < bytes.len() && bytes[pos].is_ascii_digit() {
1504 pos += 1;
1505 }
1506 }
1507
1508 // Make sure the v-string isn't followed by identifier-continuation characters
1509 // (e.g. `v5x` should remain an identifier, not a v-string `v5` + `x`)
1510 if pos < bytes.len() {
1511 let next_byte = bytes[pos];
1512 if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
1513 return None;
1514 }
1515 // Also check for non-ASCII identifier continuations
1516 if next_byte >= 128
1517 && let Some(ch) = self.input.get(pos..).and_then(|s| s.chars().next())
1518 && is_perl_identifier_continue(ch)
1519 {
1520 return None;
1521 }
1522 }
1523
1524 // `v5` (no dots) is a valid Perl v-string meaning chr(5).
1525 let text = &self.input[start..pos];
1526
1527 self.position = pos;
1528 self.mode = LexerMode::ExpectOperator;
1529
1530 Some(Token {
1531 token_type: TokenType::Version(Arc::from(text)),
1532 text: Arc::from(text),
1533 start,
1534 end: self.position,
1535 })
1536 }
1537
1538 #[inline]
1539 fn apostrophe_starts_legacy_package_segment(&self, position: usize) -> bool {
1540 let next_position = position + '\''.len_utf8();
1541 self.input
1542 .get(next_position..)
1543 .and_then(|suffix| suffix.chars().next())
1544 .is_some_and(is_perl_identifier_start)
1545 }
1546
/// Try to lex a bareword at the current position: an identifier, a keyword, a
/// `__DATA__`/`__END__` marker, or a word that introduces a quote-like operator
/// (`s`, `tr`, `y`, and whatever `quote_handler::is_quote_operator` accepts).
///
/// Returns `None`, consuming nothing, when the current character cannot start an
/// identifier.
///
/// Processing order:
/// 1. Early detection of `s'`, `y'` and `tr'` so the apostrophe is not swallowed as
///    a legacy package separator.
/// 2. Scan the word itself, including `Foo'Bar` and `Foo::bar` package spellings.
/// 3. Recognise `__DATA__` / `__END__` at the start of a line (code channel only).
/// 4. Decide whether `s` / `tr` / `y` begin a substitution/transliteration.
/// 5. Classify as keyword or identifier and update `self.mode` accordingly; quote
///    operators are handed to `parse_quote_operator`.
///
/// NOTE(review): only the final return path resets `after_arrow` and
/// `after_var_subscript`; the early returns (quote operators, data marker,
/// identifier fallbacks inside the quote-operator arm) leave them as they were.
#[inline]
fn try_identifier_or_keyword(&mut self) -> Option<Token> {
    let start = self.position;
    let ch = self.current_char()?;
    let bytes = self.input_bytes;
    let len = bytes.len();

    if is_perl_identifier_start(ch) {
        // Special case: substitution/transliteration with single-quote delimiter
        // The single quote is considered an identifier continuation, so we need to
        // detect these operators before consuming it as part of an identifier.
        // All three checks are skipped right after a sigil, after `->`, and inside
        // hash subscript braces, where the word is a name/key rather than an operator.
        let follows_sigil_prefix = self.immediately_follows_sigil_prefix(start);
        if !follows_sigil_prefix
            && !self.after_arrow
            && self.hash_brace_depth == 0
            && ch == 's'
            && self.peek_char(1) == Some('\'')
        {
            self.advance(); // consume 's'
            return self.parse_substitution(start);
        } else if !follows_sigil_prefix
            && !self.after_arrow
            && self.hash_brace_depth == 0
            && ch == 'y'
            && self.peek_char(1) == Some('\'')
        {
            self.advance(); // consume 'y'
            return self.parse_transliteration(start);
        } else if !follows_sigil_prefix
            && !self.after_arrow
            && self.hash_brace_depth == 0
            && ch == 't'
            && self.peek_char(1) == Some('r')
            && self.peek_char(2) == Some('\'')
        {
            self.advance(); // consume 't'
            self.advance(); // consume 'r'
            return self.parse_transliteration(start);
        }

        // Fast ASCII path for identifier continuation.
        while self.position < len {
            let byte = bytes[self.position];
            if byte == b'\'' {
                if is_quote_op_word_prefix(&bytes[start..self.position])
                    || !self.apostrophe_starts_legacy_package_segment(self.position)
                {
                    // Keep apostrophe for quote/string parsing in cases like q'...'
                    // and split' ', while still accepting Foo'Bar package spelling.
                    break;
                }
                self.position += 1;
                continue;
            }

            if byte.is_ascii_alphanumeric() || byte == b'_' {
                self.position += 1;
                continue;
            }

            // Any other ASCII byte ends the word.
            if byte < 128 {
                break;
            }

            // Non-ASCII: decode the full character before classifying it.
            if let Some(ch) = self.current_char()
                && is_perl_identifier_continue(ch)
            {
                self.advance();
                continue;
            }
            break;
        }
        // Handle package-qualified identifiers like Foo::bar.
        // The loop is gated on `config.max_lookahead >= 1`; with a smaller value
        // `::` is never folded into the word here.
        while self.config.max_lookahead >= 1
            && self.position + 1 < len
            && bytes[self.position] == b':'
            && bytes[self.position + 1] == b':'
        {
            self.position += 2; // consume '::'

            // consume following identifier segment if present
            // (a trailing `::` with no segment stays part of the word)
            let Some(ch) = self.current_char() else {
                break;
            };
            if !is_perl_identifier_start(ch) {
                break;
            }
            self.advance();
            while self.position < len {
                let byte = bytes[self.position];
                if byte == b'\'' {
                    if !self.apostrophe_starts_legacy_package_segment(self.position) {
                        break;
                    }
                    self.position += 1;
                    continue;
                }

                if byte.is_ascii_alphanumeric() || byte == b'_' {
                    self.position += 1;
                    continue;
                }
                if byte < 128 {
                    break;
                }
                if let Some(ch) = self.current_char()
                    && is_perl_identifier_continue(ch)
                {
                    self.advance();
                    continue;
                }
                break;
            }
        }

        let text = &self.input[start..self.position];

        // Check for __DATA__ and __END__ markers using exact match
        // Only recognize these in code channel, not inside data/format sections or heredocs
        let in_code_channel =
            !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
                && self.pending_heredocs.is_empty();

        let marker = if in_code_channel {
            if text == "__DATA__" {
                Some("__DATA__")
            } else if text == "__END__" {
                Some("__END__")
            } else {
                None
            }
        } else {
            None
        };

        if let Some(marker_text) = marker {
            // These must be at the beginning of a line
            // Use the after_newline flag to determine if we're at line start
            if self.after_newline {
                // Check if rest of line is only whitespace
                // Only treat as data marker if line has no trailing junk
                if Self::trailing_ws_only(self.input_bytes, self.position) {
                    // Consume the rest of the line (the marker line)
                    while self.position < self.input.len()
                        && self.input_bytes[self.position] != b'\n'
                        && self.input_bytes[self.position] != b'\r'
                    {
                        self.advance();
                    }
                    self.consume_newline();

                    // Switch to data section mode
                    self.mode = LexerMode::InDataSection;

                    // The token text is the bare marker, while `end` also covers
                    // the trailing whitespace and newline consumed above.
                    return Some(Token {
                        token_type: TokenType::DataMarker(Arc::from(marker_text)),
                        text: Arc::from(marker_text),
                        start,
                        end: self.position,
                    });
                }
            }
        }

        // Check for substitution/transliteration operators
        // Skip if after '->' -- these are method names, not operators.
        #[allow(clippy::collapsible_if)]
        if !self.after_sub
            && !self.after_arrow
            && !follows_sigil_prefix
            && self.hash_brace_depth == 0
            && matches!(text, "s" | "tr" | "y")
        {
            // `candidate` is the prospective delimiter: the adjacent character, or
            // the first non-space character when whitespace follows the word.
            let immediate = self.current_char();
            let (candidate, char_after_next, has_whitespace) =
                if immediate.is_some_and(|c| c.is_whitespace()) {
                    let (nc, ca) = self.peek_nonspace_and_following();
                    (nc, ca, true)
                } else {
                    let following = immediate.and_then(|c| {
                        let j = self.position + c.len_utf8();
                        self.input.get(j..).and_then(|s| s.chars().next())
                    });
                    (immediate, following, false)
                };

            if let Some(next) = candidate {
                // `s => 1` should remain a fat-arrow hash key, not quote op.
                let is_fat_arrow = next == '=' && char_after_next == Some('>');
                // `-s` is the file-size filetest: a `-` directly before the word
                // disqualifies it as a substitution.
                let is_filetest_s = text == "s"
                    && self.input.get(..start).is_some_and(|prefix| prefix.ends_with('-'));
                let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                let is_quote_char = matches!(next, '\'' | '"') && text != "s";
                let transliteration_allows_whitespace = text == "tr" || text == "y";
                let substitution_disallows_whitespace = text == "s" && has_whitespace;
                let is_valid_delim = Self::is_quote_delim(next)
                    && !is_fat_arrow
                    && !is_filetest_s
                    && !substitution_disallows_whitespace
                    && (!has_whitespace
                        || is_paired_delim
                        || is_quote_char
                        || transliteration_allows_whitespace);

                if is_valid_delim {
                    match text {
                        "s" => return self.parse_substitution(start),
                        "tr" | "y" => return self.parse_transliteration(start),
                        // NOTE(review): the enclosing `matches!(text, "s" | "tr" | "y")`
                        // guard means this arm cannot be reached; it only exists to
                        // make the match exhaustive.
                        unexpected => {
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from(format!(
                                    "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
                                    unexpected, start
                                ))),
                                text: Arc::from(unexpected),
                                start,
                                end: self.position,
                            });
                        }
                    }
                }
            }
        }

        let token_type = if is_keyword_fast(text) {
            // Check for special keywords that affect lexer mode
            match text {
                "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
                | "sort" | "split" | "and" | "or" | "xor" | "not"
                // These keywords introduce an expression, so a following `/` is a
                // regex, not division. `return /re/`, `die /re/`, `warn /re/`,
                // `do /file/`, and `eval /re/` are all valid Perl.
                | "return" | "die" | "warn" | "do" | "eval" => {
                    self.mode = LexerMode::ExpectTerm;
                }
                "sub" => {
                    self.after_sub = true;
                    self.mode = LexerMode::ExpectTerm;
                }
                // Quote operators expect a delimiter next.
                // Skip if after '->' -- these are method names, not operators.
                // Inside hash subscript braces, regex-like operators stay bareword
                // keys (`@h{m, s}`), but q-family operators can still introduce real
                // quote expressions in slices (`@h{qw/a b/}`).
                op if !self.after_sub
                    && !self.after_arrow
                    && !follows_sigil_prefix
                    && quote_handler::is_quote_operator(op)
                    && (self.hash_brace_depth == 0
                        || matches!(op, "q" | "qq" | "qw" | "qr" | "qx")) =>
                {
                    // Perl allows whitespace between a quote-like operator and its delimiter,
                    // but ONLY for paired delimiters (s { ... } { ... }g).
                    // For non-paired delimiters (s/foo/bar/, s,foo,bar,), the delimiter
                    // must be immediately adjacent — otherwise `s $foo` would wrongly
                    // treat `$` as a delimiter instead of being a bareword `s` followed
                    // by a scalar variable.
                    //
                    // Strategy:
                    //   1. Check the immediately-adjacent char first (no whitespace skip).
                    //      If it is a valid delimiter → any non-alnum, non-whitespace char.
                    //   2. If the adjacent char is whitespace, peek past it.
                    //      Only accept PAIRED delimiters ({, [, (, <) in that case.
                    //      (The code below additionally accepts `'`, `"` and `/` after
                    //      whitespace for operators other than `s`.)
                    let immediate = self.current_char();
                    let (candidate, char_after_next, has_whitespace) =
                        if immediate.is_some_and(|c| c.is_whitespace()) {
                            // There is whitespace — peek past it
                            let (nc, ca) = self.peek_nonspace_and_following();
                            (nc, ca, true)
                        } else {
                            // No whitespace — use immediate char
                            let following = immediate.and_then(|c| {
                                let j = self.position + c.len_utf8();
                                self.input.get(j..).and_then(|s| s.chars().next())
                            });
                            (immediate, following, false)
                        };

                    if let Some(next) = candidate {
                        // Fat-arrow autoquoting: `s => value` — `=` followed by `>` is '=>',
                        // not a valid substitution delimiter. Treat as identifier.
                        let is_fat_arrow = next == '=' && char_after_next == Some('>');
                        let is_filetest_s =
                            op == "s" && self.input.get(..start).is_some_and(|prefix| {
                                prefix.ends_with('-')
                            });

                        // When whitespace precedes the delimiter, only unambiguous
                        // delimiters are accepted:
                        // - Paired delimiters ({, [, (, <) are always safe.
                        // - ' and " are safe for all operators EXCEPT `s` — `-s 'filename'`
                        //   is a valid file-size filetest and must not be treated as a
                        //   substitution start.  All other operators (qw, q, qq, qr, qx, m,
                        //   tr, y) have no corresponding file-test operator.
                        // - / is safe for non-substitution quote operators; `qw /a b/` and
                        //   `m /re/` are common, while `s /foo/bar/` remains ambiguous with
                        //   the file-size test shape and stays rejected here.
                        // - Non-paired, non-quote chars ($, @, ,, etc.) remain rejected.
                        let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                        let is_quote_char = matches!(next, '\'' | '"') && op != "s";
                        let is_spaced_slash_delim = next == '/' && op != "s";
                        // Inside a hash subscript, `,` or `}` after the word means the
                        // word is a bare key (e.g. `@h{q, x}`), not an operator.
                        let is_hash_subscript_bare_key_boundary =
                            self.hash_brace_depth > 0 && matches!(next, ',' | '}');
                        let is_valid_delim = Self::is_quote_delim(next)
                            && !is_fat_arrow
                            && !is_filetest_s
                            && !is_hash_subscript_bare_key_boundary
                            && (!has_whitespace
                                || is_paired_delim
                                || is_quote_char
                                || is_spaced_slash_delim);

                        if is_valid_delim {
                            self.mode = LexerMode::ExpectDelimiter;
                            self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
                                operator: op.to_string(),
                                delimiter: '\0', // Will be set when we see the delimiter
                                start_pos: start,
                            });

                            // Don't return a keyword token - continue to parse the delimiter
                            // Skip any whitespace between operator and delimiter
                            while let Some(ch) = self.current_char() {
                                if ch.is_whitespace() {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }

                            // Get the delimiter
                            // NOTE(review): this uses the Unicode-aware `is_alphanumeric`,
                            // whereas `is_quote_delim` only excludes ASCII alphanumerics.
                            // A non-ASCII letter therefore passes `is_valid_delim` but
                            // fails here and falls through to the identifier fallback.
                            #[allow(clippy::collapsible_if)]
                            if let Some(delim) = self.current_char() {
                                if !delim.is_alphanumeric() {
                                    self.advance();
                                    if let Some(ref mut info) = self.current_quote_op {
                                        info.delimiter = delim;
                                    }
                                    // Parse the quote operator content and return the complete token
                                    return self.parse_quote_operator(delim);
                                }
                            }
                        } else {
                            // Not a quote operator here → treat as IDENTIFIER
                            self.current_quote_op = None;
                            self.mode = LexerMode::ExpectOperator;
                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                start,
                                end: self.position,
                                text: Arc::from(text),
                            });
                        }
                    } else {
                        // End-of-input after the word → also treat as IDENTIFIER
                        self.current_quote_op = None;
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            start,
                            end: self.position,
                            text: Arc::from(text),
                        });
                    }
                    // If we get here but haven't returned, something went wrong
                    // Fall through to treat as identifier
                    // (any whitespace skipped above stays consumed, so `end` may lie
                    // past the end of `text`)
                    self.current_quote_op = None;
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        start,
                        end: self.position,
                        text: Arc::from(text),
                    });
                }
                // Format declarations need special handling
                "format" => {
                    // We'll need to check for the = after the format name
                    // For now, just mark that we saw format
                    // (this arm does not change `self.mode`)
                }
                _ if is_builtin_function(text) => {
                    // Bare builtins are term-introducing in Perl.
                    self.mode = LexerMode::ExpectTerm;
                }
                _ => {
                    self.mode = LexerMode::ExpectOperator;
                }
            }
            TokenType::Keyword(Arc::from(text))
        } else {
            // Mirror parser bare-builtin handling so `/` after builtins like
            // `join` or `print` is lexed as a regex term, not division.
            if is_builtin_function(text) {
                self.mode = LexerMode::ExpectTerm;
            } else {
                self.mode = LexerMode::ExpectOperator;
            }
            TokenType::Identifier(Arc::from(text))
        };

        self.after_arrow = false;
        // A keyword/identifier is not a variable; `{` after it is a block opener.
        self.after_var_subscript = false;
        // hash_brace_depth is managed by { and } handlers, not cleared per-token
        Some(Token { token_type, text: Arc::from(text), start, end: self.position })
    } else {
        None
    }
}
1956
1957 /// Parse data section body - consumes everything to EOF
1958 fn parse_data_body(&mut self) -> Option<Token> {
1959 if self.position >= self.input.len() {
1960 // Already at EOF
1961 self.mode = LexerMode::ExpectTerm;
1962 return Some(Token {
1963 token_type: TokenType::EOF,
1964 text: Arc::from(""),
1965 start: self.position,
1966 end: self.position,
1967 });
1968 }
1969
1970 let start = self.position;
1971 // Consume everything to EOF
1972 let body = &self.input[self.position..];
1973 self.position = self.input.len();
1974
1975 // Reset mode for next parse (though we're at EOF)
1976 self.mode = LexerMode::ExpectTerm;
1977
1978 Some(Token {
1979 token_type: TokenType::DataBody(Arc::from(body)),
1980 text: Arc::from(body),
1981 start,
1982 end: self.position,
1983 })
1984 }
1985
1986 /// Parse format body - consumes until a line with just a dot
1987 fn parse_format_body(&mut self) -> Option<Token> {
1988 let start = self.position;
1989 let mut body = String::new();
1990 let mut line_start = true;
1991
1992 while self.position < self.input.len() {
1993 // Check if we're at the start of a line and the next char is a dot
1994 if line_start && self.current_char() == Some('.') {
1995 // Check if this line contains only a dot
1996 let mut peek_pos = self.position + 1;
1997 let mut found_terminator = true;
1998
1999 // Skip any trailing whitespace on the dot line
2000 while peek_pos < self.input.len() {
2001 match self.input_bytes[peek_pos] {
2002 b' ' | b'\t' | b'\r' => peek_pos += 1,
2003 b'\n' => break,
2004 _ => {
2005 found_terminator = false;
2006 break;
2007 }
2008 }
2009 }
2010
2011 if found_terminator {
2012 // We found the terminating dot, consume it
2013 self.position = peek_pos;
2014 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
2015 {
2016 self.position += 1;
2017 }
2018
2019 // Switch back to normal mode
2020 self.mode = LexerMode::ExpectTerm;
2021
2022 return Some(Token {
2023 token_type: TokenType::FormatBody(Arc::from(body.clone())),
2024 text: Arc::from(body),
2025 start,
2026 end: self.position,
2027 });
2028 }
2029 }
2030
2031 // Not a terminator, consume the character
2032 match self.current_char() {
2033 Some(ch) => {
2034 body.push(ch);
2035 self.advance();
2036
2037 // Track if we're at the start of a line
2038 line_start = ch == '\n';
2039 }
2040 None => {
2041 // Reached EOF without finding terminator
2042 break;
2043 }
2044 }
2045 }
2046
2047 // If we reach here, we didn't find a terminator
2048 self.mode = LexerMode::ExpectTerm;
2049 Some(Token {
2050 token_type: TokenType::Error(Arc::from("Unterminated format body")),
2051 text: Arc::from(body),
2052 start,
2053 end: self.position,
2054 })
2055 }
2056
2057 fn try_operator(&mut self) -> Option<Token> {
2058 // Skip operator parsing if we're expecting a delimiter for a quote operator
2059 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2060 return None;
2061 }
2062
2063 let start = self.position;
2064 let ch = self.current_char()?;
2065
2066 // ═══════════════════════════════════════════════════════════════════════
2067 // SLASH DISAMBIGUATION STRATEGY (Issue #422)
2068 // ═══════════════════════════════════════════════════════════════════════
2069 //
2070 // Perl's `/` character is ambiguous:
2071 // - Division operator: `$x / 2`
2072 // - Regex delimiter: `/pattern/`
2073 // - Defined-or operator: `$x // $y`
2074 //
2075 // **Disambiguation Strategy (Context-Aware Heuristics):**
2076 //
2077 // 1. **Mode-Based Decision (Primary)**:
2078 // - `LexerMode::ExpectTerm` → `/` starts a regex
2079 // Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
2080 // - `LexerMode::ExpectOperator` → `/` is division or `//`
2081 // Examples: `$x / 2`, `$x // $y`, `) / 3`
2082 //
2083 // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
2084 // Mode is set based on previous token:
2085 // - After identifier/number/closing paren → ExpectOperator → division
2086 // - After operator/keyword/opening paren → ExpectTerm → regex
2087 //
2088 // 3. **Budget Protection**:
2089 // - Regex parsing has a parse-step budget and byte budget
2090 // - Budget exceeded → emit UnknownRest token (graceful degradation)
2091 // - See `parse_regex()` and `budget_guard()` for implementation
2092 //
2093 // 4. **Performance Characteristics**:
2094 // - Single-pass: O(1) decision based on mode flag
2095 // - No backtracking: Mode updated after each token
2096 // - Optimized: Byte-level operations for common cases
2097 //
2098 // **Metrics & Monitoring**:
2099 // - Budget exceeded events tracked via UnknownRest token emission
2100 // - LSP diagnostics generated for truncated regexes
2101 // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
2102 //
2103 // ═══════════════════════════════════════════════════════════════════════
2104
2105 if ch == '/' {
2106 if self.mode == LexerMode::ExpectTerm {
2107 // Mode indicates we're expecting a term → `/` starts a regex
2108 // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
2109 return self.parse_regex(start);
2110 } else {
2111 // Mode indicates we're expecting an operator → `/` is division or `//`
2112 // Examples: `$x / 2`, `$x // $y`, `10 / 3`
2113 self.advance();
2114 // Check for // or //= using byte-level operations for speed
2115 if self.peek_byte(0) == Some(b'/') {
2116 self.position += 1; // consume second / directly
2117 if self.peek_byte(0) == Some(b'=') {
2118 self.position += 1; // consume = directly
2119 let text = &self.input[start..self.position];
2120 self.mode = LexerMode::ExpectTerm;
2121 return Some(Token {
2122 token_type: TokenType::Operator(Arc::from(text)),
2123 text: Arc::from(text),
2124 start,
2125 end: self.position,
2126 });
2127 } else {
2128 // Use cached string for common "//" operator
2129 self.mode = LexerMode::ExpectTerm;
2130 return Some(Token {
2131 token_type: TokenType::Operator(Arc::from("//")),
2132 text: Arc::from("//"),
2133 start,
2134 end: self.position,
2135 });
2136 }
2137 } else if self.position < self.input_bytes.len()
2138 && self.input_bytes[self.position] == b'='
2139 {
2140 // /= division-assign operator
2141 self.position += 1; // consume =
2142 self.mode = LexerMode::ExpectTerm;
2143 return Some(Token {
2144 token_type: TokenType::Operator(Arc::from("/=")),
2145 text: Arc::from("/="),
2146 start,
2147 end: self.position,
2148 });
2149 } else {
2150 // Use cached string for common "/" division
2151 self.mode = LexerMode::ExpectTerm;
2152 return Some(Token {
2153 token_type: TokenType::Division,
2154 text: Arc::from("/"),
2155 start,
2156 end: self.position,
2157 });
2158 }
2159 }
2160 }
2161
2162 // Handle other operators - simplified
2163 match ch {
2164 '.' => {
2165 // Check if it's a decimal number like .5 -- but only when we
2166 // expect a term. In operator position `.5` is concatenation
2167 // of the bareword/number on the left with the number `5`.
2168 if self.mode != LexerMode::ExpectOperator
2169 && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
2170 {
2171 return self.parse_decimal_number(start);
2172 }
2173 self.advance();
2174 // Check for compound operators
2175 #[allow(clippy::collapsible_if)]
2176 if let Some(next) = self.current_char() {
2177 if is_compound_operator(ch, next) {
2178 self.advance();
2179
2180 // Check for three-character operators like **=, <<=, >>=
2181 if self.position < self.input.len() {
2182 let third = self.current_char();
2183 // Check for three-character operators
2184 if matches!(
2185 (ch, next, third),
2186 ('*', '*', Some('='))
2187 | ('<', '<', Some('='))
2188 | ('>', '>', Some('='))
2189 | ('&', '&', Some('='))
2190 | ('|', '|', Some('='))
2191 | ('/', '/', Some('='))
2192 ) {
2193 self.advance(); // consume the =
2194 } else if ch == '<' && next == '=' && third == Some('>') {
2195 self.advance(); // consume the >
2196 // Special case: <=> spaceship operator
2197 } else if ch == '.' && next == '.' && third == Some('.') {
2198 self.advance(); // consume the third .
2199 }
2200 }
2201 }
2202 }
2203 }
2204 '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
2205 | '\\' => {
2206 self.advance();
2207 // Check for compound operators
2208 #[allow(clippy::collapsible_if)]
2209 if let Some(next) = self.current_char() {
2210 if is_compound_operator(ch, next) {
2211 self.advance();
2212
2213 // Check for three-character operators like **=, <<=, >>=
2214 if self.position < self.input.len() {
2215 let third = self.current_char();
2216 // Check for three-character operators
2217 if matches!(
2218 (ch, next, third),
2219 ('*', '*', Some('='))
2220 | ('<', '<', Some('='))
2221 | ('>', '>', Some('='))
2222 | ('&', '&', Some('='))
2223 | ('|', '|', Some('='))
2224 | ('/', '/', Some('='))
2225 ) {
2226 self.advance(); // consume the =
2227 } else if ch == '<' && next == '=' && third == Some('>') {
2228 self.advance(); // consume the >
2229 // Special case: <=> spaceship operator
2230 }
2231 }
2232 }
2233 }
2234 }
2235 _ => return None,
2236 }
2237
2238 let text = &self.input[start..self.position];
2239 // Operator ends prototype window (e.g. `:` for attributes)
2240 self.after_sub = false;
2241 // Track whether this operator is '->' for method name disambiguation
2242 self.after_arrow = text == "->";
2243 // Any operator token ends the "just saw a variable" window; `{` after
2244 // an operator is not a hash subscript (e.g. `foo() {`, `+ {`, etc.).
2245 self.after_var_subscript = false;
2246 // Postfix ++ and -- complete a term expression, so next token is an operator
2247 // (e.g., "$x++ / 2" → / is division, not regex)
2248 if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
2249 // Postfix: stay in ExpectOperator
2250 } else {
2251 self.mode = LexerMode::ExpectTerm;
2252 }
2253
2254 Some(Token {
2255 token_type: TokenType::Operator(Arc::from(text)),
2256 text: Arc::from(text),
2257 start,
2258 end: self.position,
2259 })
2260 }
2261
2262 fn try_delimiter(&mut self) -> Option<Token> {
2263 let start = self.position;
2264 let ch = self.current_char()?;
2265
2266 // If we're expecting a delimiter for a quote operator, handle it specially
2267 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2268 // Accept any non-alphanumeric character as a delimiter
2269 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2270 self.advance();
2271 if let Some(ref mut info) = self.current_quote_op {
2272 info.delimiter = ch;
2273 }
2274 // Now parse the quote operator content
2275 return self.parse_quote_operator(ch);
2276 }
2277 }
2278
2279 match ch {
2280 '(' => {
2281 // Check if this is a quote operator delimiter
2282 if matches!(self.mode, LexerMode::ExpectDelimiter)
2283 && self.current_quote_op.is_some()
2284 {
2285 self.advance();
2286 if let Some(ref mut info) = self.current_quote_op {
2287 info.delimiter = ch;
2288 }
2289 return self.parse_quote_operator(ch);
2290 }
2291
2292 self.advance();
2293 if self.after_sub {
2294 // Promote after_sub to in_prototype now that we see '('
2295 self.in_prototype = true;
2296 self.after_sub = false;
2297 self.prototype_depth = 1;
2298 } else if self.in_prototype {
2299 self.prototype_depth += 1;
2300 }
2301 self.paren_depth += 1;
2302 self.after_var_subscript = false;
2303 self.mode = LexerMode::ExpectTerm;
2304 Some(Token {
2305 token_type: TokenType::LeftParen,
2306 text: Arc::from("("),
2307 start,
2308 end: self.position,
2309 })
2310 }
2311 ')' => {
2312 self.advance();
2313 if self.in_prototype && self.prototype_depth > 0 {
2314 self.prototype_depth -= 1;
2315 if self.prototype_depth == 0 {
2316 self.in_prototype = false;
2317 }
2318 }
2319 self.after_arrow = false;
2320 self.paren_depth = self.paren_depth.saturating_sub(1);
2321 // A closing paren ends any var-subscript context: `if ($var)` should
2322 // NOT leave after_var_subscript set, otherwise the following `{` would
2323 // incorrectly increment hash_brace_depth and suppress regex operators
2324 // inside the block body (issue #2844).
2325 self.after_var_subscript = false;
2326 self.mode = LexerMode::ExpectOperator;
2327 Some(Token {
2328 token_type: TokenType::RightParen,
2329 text: Arc::from(")"),
2330 start,
2331 end: self.position,
2332 })
2333 }
2334 ';' => {
2335 self.advance();
2336 // Semicolon ends prototype window (forward declaration)
2337 self.after_sub = false;
2338 // Semicolon is a statement boundary — any pending method-call chain is over.
2339 self.after_arrow = false;
2340 self.after_var_subscript = false;
2341 self.mode = LexerMode::ExpectTerm;
2342 Some(Token {
2343 token_type: TokenType::Semicolon,
2344 text: Arc::from(";"),
2345 start,
2346 end: self.position,
2347 })
2348 }
2349 ',' => {
2350 self.advance();
2351 self.after_var_subscript = false;
2352 self.mode = LexerMode::ExpectTerm;
2353 Some(Token {
2354 token_type: TokenType::Comma,
2355 text: Arc::from(","),
2356 start,
2357 end: self.position,
2358 })
2359 }
2360 '[' => {
2361 self.advance();
2362 self.after_var_subscript = false;
2363 self.mode = LexerMode::ExpectTerm;
2364 Some(Token {
2365 token_type: TokenType::LeftBracket,
2366 text: Arc::from("["),
2367 start,
2368 end: self.position,
2369 })
2370 }
2371 ']' => {
2372 self.advance();
2373 // A closing `]` from an array subscript leaves us in a state where
2374 // a `{` immediately following is a hash subscript — e.g. `$arr[$i]{key}`.
2375 // Set after_var_subscript so the `{` handler recognises it as such.
2376 // This mirrors the `}` handler's behavior when closing a hash subscript.
2377 self.after_var_subscript = true;
2378 self.mode = LexerMode::ExpectOperator;
2379 Some(Token {
2380 token_type: TokenType::RightBracket,
2381 text: Arc::from("]"),
2382 start,
2383 end: self.position,
2384 })
2385 }
2386 '{' => {
2387 self.advance();
2388 // Opening brace ends prototype window — no prototype follows
2389 self.after_sub = false;
2390 // `{` is a hash/slice subscript opener only when it immediately follows
2391 // a variable token ($x, @x, %x) — tracked by `after_var_subscript`.
2392 // This is narrower than the old `mode == ExpectOperator` check, which
2393 // incorrectly incremented depth for block-opening braces after `sub foo`,
2394 // `if (cond)`, `else`, `while (cond)`, etc., causing quote-op suppression
2395 // inside those block bodies and breaking m//, s///, qr//, tr/// etc.
2396 if self.after_var_subscript {
2397 self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
2398 }
2399 self.after_var_subscript = false;
2400 self.mode = LexerMode::ExpectTerm;
2401 Some(Token {
2402 token_type: TokenType::LeftBrace,
2403 text: Arc::from("{"),
2404 start,
2405 end: self.position,
2406 })
2407 }
2408 '}' => {
2409 self.advance();
2410 self.after_arrow = false;
2411 // Decrement hash subscript brace depth only if we were inside one.
2412 // If depth > 0, this closes a hash subscript; enable chained subscripts
2413 // like $h{a}{b} by setting after_var_subscript so the next `{` is
2414 // recognized as another subscript opener.
2415 if self.hash_brace_depth > 0 {
2416 self.hash_brace_depth -= 1;
2417 // The subscript value is now the "variable" for a chained subscript.
2418 self.after_var_subscript = true;
2419 } else {
2420 // Block-close `}` — no subscript follows
2421 self.after_var_subscript = false;
2422 }
2423 self.mode = LexerMode::ExpectOperator;
2424 Some(Token {
2425 token_type: TokenType::RightBrace,
2426 text: Arc::from("}"),
2427 start,
2428 end: self.position,
2429 })
2430 }
2431 '#' => {
2432 // Only treat as delimiter in ExpectDelimiter mode
2433 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2434 self.advance();
2435 // Reset mode after consuming delimiter
2436 self.mode = LexerMode::ExpectTerm;
2437 Some(Token {
2438 token_type: TokenType::Operator(Arc::from("#")),
2439 text: Arc::from("#"),
2440 start,
2441 end: self.position,
2442 })
2443 } else {
2444 None
2445 }
2446 }
2447 _ => None,
2448 }
2449 }
2450
2451 fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
2452 self.advance(); // Skip opening quote
2453 let mut parts = Vec::new();
2454 let mut current_literal = String::new();
2455 let mut last_pos = self.position;
2456
2457 while let Some(ch) = self.current_char() {
2458 match ch {
2459 '"' => {
2460 self.advance();
2461 if !current_literal.is_empty() {
2462 parts.push(StringPart::Literal(Arc::from(current_literal)));
2463 }
2464
2465 let text = &self.input[start..self.position];
2466 self.mode = LexerMode::ExpectOperator;
2467
2468 return Some(Token {
2469 token_type: if parts.is_empty() {
2470 TokenType::StringLiteral
2471 } else {
2472 TokenType::InterpolatedString(parts)
2473 },
2474 text: Arc::from(text),
2475 start,
2476 end: self.position,
2477 });
2478 }
2479 '\\' => {
2480 self.advance();
2481 if let Some(escaped) = self.current_char() {
2482 // Optimize by reserving space to avoid frequent reallocations
2483 if current_literal.capacity() == 0 {
2484 current_literal.reserve(32);
2485 }
2486 current_literal.push('\\');
2487 current_literal.push(escaped);
2488 self.advance();
2489 }
2490 }
2491 '$' if self.config.parse_interpolation => {
2492 // Handle variable interpolation - avoid unnecessary clone
2493 if !current_literal.is_empty() {
2494 parts.push(StringPart::Literal(Arc::from(current_literal)));
2495 current_literal = String::new(); // Clear without cloning
2496 }
2497
2498 let part_start = self.position;
2499 self.advance();
2500 match self.current_char() {
2501 Some('{') => {
2502 let _ = self.consume_balanced_segment_in_string('{', '}', '"');
2503 parts.push(StringPart::Expression(Arc::from(
2504 &self.input[part_start..self.position],
2505 )));
2506 }
2507 Some(ch) if is_perl_identifier_start(ch) => {
2508 let var_start = self.position;
2509
2510 // Fast path for ASCII identifier continuation
2511 while self.position < self.input_bytes.len() {
2512 let byte = self.input_bytes[self.position];
2513 if byte.is_ascii_alphanumeric() || byte == b'_' {
2514 self.position += 1;
2515 } else if byte >= 128 {
2516 // Only use UTF-8 parsing for non-ASCII
2517 if let Some(ch) = self.current_char() {
2518 if is_perl_identifier_continue(ch) {
2519 self.advance();
2520 } else {
2521 break;
2522 }
2523 } else {
2524 break;
2525 }
2526 } else {
2527 break;
2528 }
2529 }
2530
2531 if self.position > var_start {
2532 let var_name = &self.input[part_start..self.position];
2533 parts.push(StringPart::Variable(Arc::from(var_name)));
2534
2535 if self.matches_bytes(b"->") {
2536 let tail_start = self.position;
2537 self.advance();
2538 self.advance();
2539
2540 match self.current_char() {
2541 Some('[') => {
2542 let _ = self
2543 .consume_balanced_segment_in_string('[', ']', '"');
2544 parts.push(StringPart::MethodCall(Arc::from(
2545 &self.input[tail_start..self.position],
2546 )));
2547 }
2548 Some('{') => {
2549 let _ = self
2550 .consume_balanced_segment_in_string('{', '}', '"');
2551 parts.push(StringPart::MethodCall(Arc::from(
2552 &self.input[tail_start..self.position],
2553 )));
2554 }
2555 Some('(') => {
2556 let _ = self
2557 .consume_balanced_segment_in_string('(', ')', '"');
2558 parts.push(StringPart::MethodCall(Arc::from(
2559 &self.input[tail_start..self.position],
2560 )));
2561 }
2562 Some(ch) if is_perl_identifier_start(ch) => {
2563 while self.position < self.input_bytes.len() {
2564 let byte = self.input_bytes[self.position];
2565 if byte.is_ascii_alphanumeric() || byte == b'_' {
2566 self.position += 1;
2567 } else if byte >= 128 {
2568 if let Some(ch) = self.current_char() {
2569 if is_perl_identifier_continue(ch) {
2570 self.advance();
2571 } else {
2572 break;
2573 }
2574 } else {
2575 break;
2576 }
2577 } else {
2578 break;
2579 }
2580 }
2581 if self.current_char() == Some('(') {
2582 let _ = self.consume_balanced_segment_in_string(
2583 '(', ')', '"',
2584 );
2585 }
2586 parts.push(StringPart::MethodCall(Arc::from(
2587 &self.input[tail_start..self.position],
2588 )));
2589 }
2590 _ => {
2591 parts.push(StringPart::MethodCall(Arc::from(
2592 &self.input[tail_start..self.position],
2593 )));
2594 }
2595 }
2596 } else if self.current_char() == Some('[') {
2597 let tail_start = self.position;
2598 let _ = self.consume_balanced_segment_in_string('[', ']', '"');
2599 parts.push(StringPart::ArraySlice(Arc::from(
2600 &self.input[tail_start..self.position],
2601 )));
2602 } else if self.current_char() == Some('{') {
2603 let tail_start = self.position;
2604 let _ = self.consume_balanced_segment_in_string('{', '}', '"');
2605 parts.push(StringPart::Expression(Arc::from(
2606 &self.input[tail_start..self.position],
2607 )));
2608 }
2609 }
2610 }
2611 _ => {}
2612 }
2613 }
2614 _ => {
2615 // Optimize string building with better capacity management
2616 if current_literal.capacity() == 0 {
2617 current_literal.reserve(32);
2618 }
2619 current_literal.push(ch);
2620 self.advance();
2621 }
2622 }
2623
2624 // Safety check: ensure we're making progress
2625 if self.position == last_pos {
2626 break;
2627 }
2628 last_pos = self.position;
2629 }
2630
2631 Some(self.unterminated_string_error(start))
2632 }
2633
2634 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2635 self.advance(); // Skip opening quote
2636
2637 let mut last_pos = self.position;
2638
2639 while let Some(ch) = self.current_char() {
2640 match ch {
2641 '\'' => {
2642 self.advance();
2643 let text = &self.input[start..self.position];
2644 self.mode = LexerMode::ExpectOperator;
2645
2646 return Some(Token {
2647 token_type: TokenType::StringLiteral,
2648 text: Arc::from(text),
2649 start,
2650 end: self.position,
2651 });
2652 }
2653 '\\' => {
2654 self.advance();
2655 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2656 self.advance();
2657 }
2658 }
2659 _ => self.advance(),
2660 }
2661
2662 // Safety check: ensure we're making progress
2663 if self.position == last_pos {
2664 break;
2665 }
2666 last_pos = self.position;
2667 }
2668
2669 Some(self.unterminated_string_error(start))
2670 }
2671
2672 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2673 self.advance(); // Skip opening backtick
2674
2675 let mut last_pos = self.position;
2676
2677 while let Some(ch) = self.current_char() {
2678 match ch {
2679 '`' => {
2680 self.advance();
2681 let text = &self.input[start..self.position];
2682 self.mode = LexerMode::ExpectOperator;
2683
2684 return Some(Token {
2685 token_type: TokenType::QuoteCommand,
2686 text: Arc::from(text),
2687 start,
2688 end: self.position,
2689 });
2690 }
2691 '\\' => {
2692 self.advance();
2693 if self.current_char().is_some() {
2694 self.advance();
2695 }
2696 }
2697 _ => self.advance(),
2698 }
2699
2700 // Safety check: ensure we're making progress
2701 if self.position == last_pos {
2702 break;
2703 }
2704 last_pos = self.position;
2705 }
2706
2707 Some(self.unterminated_string_error(start))
2708 }
2709
2710 fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
2711 // Simplified q-string parsing
2712 None
2713 }
2714
2715 #[inline]
2716 fn unterminated_string_error(&mut self, start: usize) -> Token {
2717 // Consume to EOF so the caller receives a single terminal error token.
2718 let end = self.input.len();
2719 self.position = end;
2720
2721 Token {
2722 token_type: TokenType::Error(Arc::from("unterminated string")),
2723 text: Arc::from(&self.input[start..end]),
2724 start,
2725 end,
2726 }
2727 }
2728
2729 fn parse_substitution(&mut self, start: usize) -> Option<Token> {
2730 // We've already consumed 's'
2731 let delimiter = self.current_char()?;
2732 self.advance(); // Skip delimiter
2733 self.parse_substitution_with_delimiter(start, delimiter)
2734 }
2735
2736 fn parse_substitution_with_delimiter(
2737 &mut self,
2738 start: usize,
2739 delimiter: char,
2740 ) -> Option<Token> {
2741 let (_pattern, pattern_closed) = self.read_delimited_body(delimiter);
2742 let replacement_closed;
2743
2744 let pattern_is_paired = quote_handler::paired_close(delimiter).is_some();
2745 if pattern_is_paired {
2746 self.skip_paired_substitution_replacement_gap();
2747
2748 if let Some(repl_delim) = self.current_char()
2749 && Self::is_quote_delim(repl_delim)
2750 {
2751 self.advance();
2752 let (_replacement, closed) = self.read_substitution_replacement_body(repl_delim);
2753 replacement_closed = closed;
2754 } else {
2755 replacement_closed = false;
2756 }
2757 } else {
2758 let (_replacement, closed) = self.read_substitution_replacement_body(delimiter);
2759 replacement_closed = closed;
2760 }
2761
2762 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2763 while let Some(ch) = self.current_char() {
2764 if ch.is_ascii_alphanumeric() {
2765 self.advance();
2766 } else {
2767 break;
2768 }
2769 }
2770
2771 let text = &self.input[start..self.position];
2772 self.mode = LexerMode::ExpectOperator;
2773
2774 let token_type = if pattern_closed && replacement_closed {
2775 TokenType::Substitution
2776 } else {
2777 TokenType::Error(Arc::from(format!(
2778 "unclosed quote-like operator 's' delimiter '{}'",
2779 delimiter
2780 )))
2781 };
2782
2783 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2784 }
2785
2786 fn skip_paired_substitution_replacement_gap(&mut self) {
2787 let mut comment_eligible = false;
2788 loop {
2789 let mut saw_whitespace = false;
2790 while self.current_char().is_some_and(char::is_whitespace) {
2791 self.advance();
2792 saw_whitespace = true;
2793 }
2794 comment_eligible |= saw_whitespace;
2795
2796 if comment_eligible && self.current_char() == Some('#') {
2797 while let Some(ch) = self.current_char() {
2798 self.advance();
2799 if matches!(ch, '\n' | '\r') {
2800 break;
2801 }
2802 }
2803 comment_eligible = true;
2804 continue;
2805 }
2806
2807 break;
2808 }
2809 }
2810
2811 fn read_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2812 if quote_handler::paired_close(delim).is_some() {
2813 return self.read_delimited_body(delim);
2814 }
2815
2816 self.read_unpaired_substitution_replacement_body(delim)
2817 }
2818
2819 fn read_unpaired_substitution_replacement_body(&mut self, delim: char) -> (String, bool) {
2820 let mut body = String::new();
2821 let mut escaped = false;
2822
2823 while let Some(ch) = self.current_char() {
2824 if escaped {
2825 body.push(ch);
2826 self.advance();
2827 escaped = false;
2828 continue;
2829 }
2830
2831 match ch {
2832 '\\' => {
2833 body.push(ch);
2834 self.advance();
2835 escaped = true;
2836 }
2837 '"' | '\'' if ch != delim => {
2838 if let Some((string_end, true)) =
2839 self.scan_inner_string_for_delimiter(self.position, ch, delim)
2840 {
2841 if let Some(string_text) = self.input.get(self.position..string_end) {
2842 body.push_str(string_text);
2843 self.position = string_end;
2844 } else {
2845 body.push(ch);
2846 self.advance();
2847 }
2848 } else {
2849 body.push(ch);
2850 self.advance();
2851 }
2852 }
2853 c if c == delim => {
2854 self.advance();
2855 return (body, true);
2856 }
2857 _ => {
2858 body.push(ch);
2859 self.advance();
2860 }
2861 }
2862 }
2863
2864 (body, false)
2865 }
2866
2867 fn scan_inner_string_for_delimiter(
2868 &self,
2869 start: usize,
2870 quote: char,
2871 delim: char,
2872 ) -> Option<(usize, bool)> {
2873 if Self::is_word_apostrophe(self.input, start, quote) {
2874 return None;
2875 }
2876 // Adjacent quotes are literal replacement text (for example s/"/""/g),
2877 // not a string literal to skip while hunting for the replacement delimiter.
2878 if self.input.get(..start).and_then(|text| text.chars().next_back()) == Some(quote) {
2879 return None;
2880 }
2881 let mut pos = start.checked_add(quote.len_utf8())?;
2882 let expression_quote = Self::can_start_replacement_expression_quote(self.input, start);
2883 if !expression_quote && self.input.get(pos..).is_some_and(|text| text.starts_with(delim)) {
2884 return None;
2885 }
2886 if self.input.get(pos..).is_some_and(|text| text.starts_with(quote)) {
2887 return None;
2888 }
2889 let mut escaped = false;
2890 let mut contains_delim = false;
2891
2892 while let Some(ch) = self.input.get(pos..).and_then(|text| text.chars().next()) {
2893 if matches!(ch, '\n' | '\r') {
2894 return None;
2895 }
2896 if !expression_quote && matches!(ch, ';' | '#') {
2897 return None;
2898 }
2899
2900 if escaped {
2901 if ch == delim {
2902 contains_delim = true;
2903 }
2904 pos += ch.len_utf8();
2905 escaped = false;
2906 continue;
2907 }
2908
2909 match ch {
2910 '\\' => {
2911 pos += ch.len_utf8();
2912 escaped = true;
2913 }
2914 c if c == quote => {
2915 return Some((pos + ch.len_utf8(), contains_delim));
2916 }
2917 c if c == delim => {
2918 contains_delim = true;
2919 pos += ch.len_utf8();
2920 }
2921 _ => {
2922 pos += ch.len_utf8();
2923 }
2924 }
2925 }
2926
2927 None
2928 }
2929
/// Report whether a quote at byte offset `pos` sits where an expression could
/// start, judged by the last non-whitespace character before it.
///
/// Only quotes in such positions are treated as delimiter-bearing inner
/// strings; elsewhere a quote is literal replacement text and the next
/// delimiter still closes the substitution. Returns `false` when nothing but
/// whitespace precedes `pos` or when `pos` is not a valid slice boundary.
fn can_start_replacement_expression_quote(input: &str, pos: usize) -> bool {
    // Characters after which a term, and hence a string literal, may follow.
    const EXPRESSION_LEADERS: &str = "([{,=:?!~+-*%&|^<>";

    let Some(before) = input.get(..pos) else {
        return false;
    };

    match before.trim_end().chars().next_back() {
        Some(previous) => EXPRESSION_LEADERS.contains(previous),
        None => false,
    }
}
2960
/// Report whether `quote` at byte offset `pos` is an apostrophe inside a word:
/// a `'` whose preceding character is an ASCII letter, digit or underscore.
///
/// Returns `false` for any other quote character, at the start of the input,
/// or when `pos` is not a valid slice boundary.
fn is_word_apostrophe(input: &str, pos: usize, quote: char) -> bool {
    if quote != '\'' {
        return false;
    }

    input
        .get(..pos)
        .is_some_and(|before| before.ends_with(|c: char| c == '_' || c.is_ascii_alphanumeric()))
}
2968
2969 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
2970 // We've already consumed 'tr' or 'y'
2971 while self.current_char().is_some_and(char::is_whitespace) {
2972 self.advance();
2973 }
2974
2975 let delimiter = self.current_char()?;
2976 self.advance(); // Skip delimiter
2977 self.parse_transliteration_with_delimiter(start, delimiter)
2978 }
2979
2980 fn parse_transliteration_with_delimiter(
2981 &mut self,
2982 start: usize,
2983 delimiter: char,
2984 ) -> Option<Token> {
2985 let (_search, search_closed) = self.read_delimited_body(delimiter);
2986 let replacement_closed;
2987
2988 let search_is_paired = quote_handler::paired_close(delimiter).is_some();
2989 if search_is_paired {
2990 while self.current_char().is_some_and(char::is_whitespace) {
2991 self.advance();
2992 }
2993
2994 if let Some(repl_delim) = self.current_char()
2995 && Self::is_quote_delim(repl_delim)
2996 {
2997 self.advance();
2998 let (_replacement, closed) = self.read_delimited_body(repl_delim);
2999 replacement_closed = closed;
3000 } else {
3001 replacement_closed = false;
3002 }
3003 } else {
3004 let (_replacement, closed) = self.read_delimited_body(delimiter);
3005 replacement_closed = closed;
3006 }
3007
3008 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
3009 while let Some(ch) = self.current_char() {
3010 if ch.is_ascii_alphanumeric() {
3011 self.advance();
3012 } else {
3013 break;
3014 }
3015 }
3016
3017 let text = &self.input[start..self.position];
3018 self.mode = LexerMode::ExpectOperator;
3019
3020 let token_type = if search_closed && replacement_closed {
3021 TokenType::Transliteration
3022 } else {
3023 TokenType::Error(Arc::from(format!(
3024 "unclosed quote-like operator '{}' delimiter '{}'",
3025 if self.input[start..].starts_with("tr") { "tr" } else { "y" },
3026 delimiter
3027 )))
3028 };
3029
3030 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3031 }
3032
3033 /// Read content between delimiters.
3034 ///
3035 /// Returns `(body, closed)` where `closed` is `true` if the closing
3036 /// delimiter was found before EOF, and `false` if EOF was reached first.
3037 fn read_delimited_body(&mut self, delim: char) -> (String, bool) {
3038 let paired = quote_handler::paired_close(delim);
3039 let close = paired.unwrap_or(delim);
3040 let mut body = String::new();
3041 let mut depth = i32::from(paired.is_some());
3042
3043 while let Some(ch) = self.current_char() {
3044 if ch == '\\' {
3045 body.push(ch);
3046 self.advance();
3047 if let Some(next) = self.current_char() {
3048 body.push(next);
3049 self.advance();
3050 }
3051 continue;
3052 }
3053
3054 if paired.is_some() && ch == delim {
3055 body.push(ch);
3056 self.advance();
3057 depth += 1;
3058 continue;
3059 }
3060
3061 if ch == close {
3062 if paired.is_some() {
3063 depth -= 1;
3064 if depth == 0 {
3065 self.advance();
3066 return (body, true);
3067 }
3068 body.push(ch);
3069 self.advance();
3070 } else {
3071 self.advance();
3072 return (body, true);
3073 }
3074 continue;
3075 }
3076
3077 body.push(ch);
3078 self.advance();
3079 }
3080
3081 // EOF reached without finding the closing delimiter
3082 (body, false)
3083 }
3084
3085 /// Parse a quote operator after we've seen the delimiter
3086 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
3087 let info = self.current_quote_op.as_ref()?;
3088 let start = info.start_pos;
3089 let operator = info.operator.clone();
3090
3091 // Clear the quote-op context eagerly so any early-return path (s/tr/y delegations
3092 // below) does not leave a stale reference behind. The post-match cleanup at the
3093 // bottom of this function would otherwise be skipped for those operators.
3094 self.current_quote_op = None;
3095
3096 // Parse based on operator type; track whether all delimiters were closed.
3097 let closed = match operator.as_str() {
3098 "s" => {
3099 return self.parse_substitution_with_delimiter(start, delimiter);
3100 }
3101 "tr" | "y" => {
3102 return self.parse_transliteration_with_delimiter(start, delimiter);
3103 }
3104 "qr" => {
3105 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3106 self.parse_regex_modifiers("e_handler::QR_SPEC);
3107 body_closed
3108 }
3109 "m" => {
3110 let (_pattern, body_closed) = self.read_delimited_body(delimiter);
3111 self.parse_regex_modifiers("e_handler::M_SPEC);
3112 body_closed
3113 }
3114 _ => {
3115 // q, qq, qw, qx - no modifiers
3116 let (_body, body_closed) = self.read_delimited_body(delimiter);
3117 body_closed
3118 }
3119 };
3120
3121 let text = &self.input[start..self.position];
3122
3123 self.mode = LexerMode::ExpectOperator;
3124
3125 if !closed {
3126 // EOF reached before finding the closing delimiter — emit an error
3127 // token so the parser's recovery mechanism records a diagnostic.
3128 return Some(Token {
3129 token_type: TokenType::Error(Arc::from(format!(
3130 "unclosed {} delimiter '{}'",
3131 operator, delimiter
3132 ))),
3133 text: Arc::from(text),
3134 start,
3135 end: self.position,
3136 });
3137 }
3138
3139 let token_type = quote_handler::get_quote_token_type(&operator);
3140 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
3141 }
3142
3143 /// Parse regex modifiers according to the given spec
3144 ///
3145 /// This function includes ALL characters that could be intended as modifiers,
3146 /// including invalid ones. This allows the parser to properly reject invalid
3147 /// modifiers with a clear error message, rather than leaving them as separate
3148 /// tokens that could be confusingly parsed.
3149 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
3150 // Consume all alphanumeric characters that could be intended as modifiers
3151 // The parser will validate and reject invalid ones
3152 while let Some(ch) = self.current_char() {
3153 if ch.is_ascii_alphanumeric() {
3154 self.advance();
3155 } else {
3156 break;
3157 }
3158 }
3159 // Note: We no longer validate here - the parser will validate and provide
3160 // clear error messages for invalid modifiers (MUT_005 fix)
3161 }
3162
3163 /// Parse a regex literal starting with `/`
3164 ///
3165 /// **Budget Protection (Issue #422)**:
3166 /// - Budget guards prevent runaway scanning on pathological input
3167 /// - `MAX_REGEX_PARSE_STEPS` bounds literal scanning before the byte budget
3168 /// - `MAX_REGEX_BYTES` bounds total bytes consumed in a single regex literal
3169 /// - Graceful degradation: emit UnknownRest token if budget exceeded
3170 ///
3171 /// **Performance**:
3172 /// - Single-pass scanning with escape handling
3173 /// - Budget check per iteration (amortized O(1) via inline fast path)
3174 /// - Typical regex: <10μs, Large regex (64KB): ~1ms
3175 fn parse_regex(&mut self, start: usize) -> Option<Token> {
3176 self.advance(); // Skip opening /
3177
3178 let mut regex_parse_steps: usize = 0;
3179 let mut in_character_class = false;
3180
3181 while let Some(ch) = self.current_char() {
3182 regex_parse_steps += 1;
3183 if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
3184 #[cfg(debug_assertions)]
3185 {
3186 let text = &self.input[start..self.position];
3187 let preview = truncate_preview(text, 50);
3188 tracing::debug!(
3189 limit = MAX_REGEX_PARSE_STEPS,
3190 pattern_preview = %preview,
3191 "Regex parse step budget exceeded"
3192 );
3193 }
3194 self.position = self.input.len();
3195 return Some(Token {
3196 token_type: TokenType::UnknownRest,
3197 text: empty_arc(),
3198 start,
3199 end: self.position,
3200 });
3201 }
3202
3203 // Budget guard: prevent timeout on pathological input (Issue #422)
3204 // If exceeded, returns UnknownRest token for graceful degradation
3205 if let Some(token) = self.budget_guard(start, 0) {
3206 return Some(token);
3207 }
3208
3209 match ch {
3210 '/' if !in_character_class => {
3211 self.advance();
3212 // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
3213 while let Some(ch) = self.current_char() {
3214 if ch.is_ascii_alphanumeric() {
3215 self.advance();
3216 } else {
3217 break;
3218 }
3219 }
3220
3221 let text = &self.input[start..self.position];
3222 self.mode = LexerMode::ExpectOperator;
3223
3224 return Some(Token {
3225 token_type: TokenType::RegexMatch,
3226 text: Arc::from(text),
3227 start,
3228 end: self.position,
3229 });
3230 }
3231 '\\' => {
3232 // Handle escape sequences: consume backslash + next char
3233 self.advance();
3234 if self.current_char().is_some() {
3235 self.advance();
3236 }
3237 }
3238 '[' => {
3239 in_character_class = true;
3240 self.advance();
3241 }
3242 ']' if in_character_class => {
3243 in_character_class = false;
3244 self.advance();
3245 }
3246 _ => self.advance(),
3247 }
3248 }
3249
3250 // Unterminated regex - EOF reached before closing /
3251 // Parser will emit diagnostic for unterminated literal
3252 None
3253 }
3254}
3255
3256// Checkpoint support for incremental parsing
3257
3258mod checkpoint_impl;
3259
3260#[cfg(test)]
3261mod test_format_debug;
3262#[cfg(test)]
3263mod tests;