// perl_lexer/lib.rs
1//! Context-aware Perl lexer with mode-based tokenization
2//!
3//! This crate provides a high-performance lexer for Perl that handles the inherently
4//! context-sensitive nature of the language. The lexer uses a mode-tracking system to
5//! correctly disambiguate ambiguous syntax like `/` (division vs. regex) and properly
6//! parse complex constructs like heredocs, quote-like operators, and nested delimiters.
7//!
8//! # Architecture
9//!
10//! The lexer is organized around several key concepts:
11//!
12//! - **Mode Tracking**: [`LexerMode`] tracks whether the parser expects a term or an operator,
13//! enabling correct disambiguation of context-sensitive tokens.
14//! - **Checkpointing**: [`LexerCheckpoint`] and [`Checkpointable`] support incremental parsing
15//! by allowing the lexer state to be saved and restored.
16//! - **Budget Limits**: Protection against pathological input with configurable size limits
17//! for regex patterns, heredoc bodies, and delimiter nesting depth.
18//! - **Position Tracking**: [`Position`] maintains line/column information for error reporting
19//! and LSP integration.
20//! - **Unicode Support**: Full Unicode identifier support following Perl 5.14+ semantics.
21//!
22//! # Usage
23//!
24//! ## Basic Tokenization
25//!
26//! ```rust
27//! use perl_lexer::{PerlLexer, TokenType};
28//!
29//! let mut lexer = PerlLexer::new("my $x = 42;");
30//! let tokens = lexer.collect_tokens();
31//!
32//! // First token is the keyword `my`
33//! assert!(matches!(&tokens[0].token_type, TokenType::Keyword(k) if &**k == "my"));
34//! // Tokens include variables, operators, literals, and EOF
35//! assert!(matches!(&tokens.last().map(|t| &t.token_type), Some(TokenType::EOF)));
36//! ```
37//!
38//! ## Context-Aware Parsing
39//!
40//! The lexer automatically tracks context to disambiguate operators:
41//!
42//! ```rust
43//! use perl_lexer::{PerlLexer, TokenType};
44//!
45//! // Division operator (after a term)
46//! let mut lexer = PerlLexer::new("42 / 2");
47//! // Regex operator (at start of expression)
48//! let mut lexer2 = PerlLexer::new("/pattern/");
49//! ```
50//!
51//! ## Checkpointing for Incremental Parsing
52//!
53//! ```rust,ignore
54//! use perl_lexer::{PerlLexer, Checkpointable};
55//!
56//! let mut lexer = PerlLexer::new("my $x = 1;");
57//! let checkpoint = lexer.checkpoint();
58//!
59//! // Parse some tokens
60//! let _ = lexer.next_token();
61//!
62//! // Restore to checkpoint
63//! lexer.restore(&checkpoint);
64//! ```
65//!
66//! ## Configuration Options
67//!
68//! ```rust
69//! use perl_lexer::{PerlLexer, LexerConfig};
70//!
71//! let config = LexerConfig {
72//! parse_interpolation: true, // Parse string interpolation
73//! track_positions: true, // Track line/column positions
74//! max_lookahead: 1024, // Maximum lookahead for disambiguation
75//! };
76//!
77//! let mut lexer = PerlLexer::with_config("my $x = 1;", config);
78//! ```
79//!
80//! # Context Sensitivity Examples
81//!
82//! Perl's grammar is highly context-sensitive. The lexer handles these cases:
83//!
84//! - **Division vs. Regex**: `/` is division after terms, regex at expression start
85//! - **Modulo vs. Hash Sigil**: `%` is modulo after terms, hash sigil at expression start
86//! - **Glob vs. Exponent**: `**` can be exponentiation or glob pattern start
87//! - **Defined-or vs. Regex**: `//` is defined-or after terms, regex at expression start
88//! - **Heredoc Markers**: `<<` can be left shift, here-doc, or numeric less-than-less-than
89//!
90//! # Budget Limits
91//!
92//! To prevent hangs on pathological input, the lexer enforces these limits:
93//!
94//! - **MAX_REGEX_BYTES**: 64KB maximum for regex patterns
95//! - **MAX_HEREDOC_BYTES**: 256KB maximum for heredoc bodies
96//! - **MAX_DELIM_NEST**: 128 levels maximum nesting depth for delimiters
97//!
98//! When limits are exceeded, the lexer emits an `UnknownRest` token preserving
99//! all previously parsed symbols, allowing continued analysis.
100//!
101//! # Integration with perl-parser
102//!
103//! The lexer is designed to work seamlessly with `perl_parser::Parser`:
104//!
105//! ```rust,ignore
106//! use perl_parser::Parser;
107//!
108//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
109//! let code = "sub hello { print qq{Hello, world!\\n}; }";
110//! let mut parser = Parser::new(code);
111//! let ast = parser.parse()?;
112//! # Ok(())
113//! # }
114//! ```
115//!
116//! The parser automatically creates and manages a `PerlLexer` instance internally.
117
118#![warn(clippy::all)]
119#![allow(
120 // Core allows for lexer code
121 clippy::too_many_lines,
122 clippy::module_name_repetitions,
123 clippy::cast_possible_truncation,
124 clippy::cast_sign_loss,
125 clippy::cast_possible_wrap,
126 clippy::cast_precision_loss,
127 clippy::must_use_candidate,
128 clippy::missing_errors_doc,
129 clippy::missing_panics_doc,
130
131 // Lexer-specific patterns that are fine
132 clippy::match_same_arms,
133 clippy::redundant_else,
134 clippy::unnecessary_wraps,
135 clippy::unused_self,
136 clippy::items_after_statements,
137 clippy::struct_excessive_bools,
138 clippy::uninlined_format_args
139)]
140
141use perl_keywords::is_lexer_keyword;
142use std::sync::{Arc, OnceLock};
143
144pub mod checkpoint;
145pub mod error;
146pub mod mode;
147mod quote_handler;
148pub mod token;
149mod unicode;
150
151pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
152pub use error::{LexerError, Result};
153pub use mode::LexerMode;
154pub use perl_position_tracking::Position;
155pub use token::{StringPart, Token, TokenType};
156
157use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
158
/// Specification for a pending heredoc
///
/// Created when a `<<LABEL` introducer is lexed; the body is consumed later,
/// starting on the first line after the introducer's physical line.
#[derive(Clone)]
struct HeredocSpec {
    /// Terminator label without quotes, e.g. `EOF` for `<<"EOF"`.
    label: Arc<str>,
    body_start: usize, // byte offset where the body begins (0 = not yet known)
    allow_indent: bool, // true if we saw <<~ (Perl 5.26 indented heredocs)
}
166
// Budget limits to prevent hangs on pathological input
// When these limits are exceeded, the lexer gracefully truncates the token
// as UnknownRest, preserving all previously parsed symbols and allowing
// continued analysis of the remainder. LSP clients may emit a soft diagnostic
// about truncation but won't crash or hang.
const MAX_REGEX_BYTES: usize = 64 * 1024; // 64KB max for regex patterns
const MAX_HEREDOC_BYTES: usize = 256 * 1024; // 256KB max for heredoc bodies
const MAX_DELIM_NEST: usize = 128; // Max nesting depth for delimiters
const MAX_HEREDOC_DEPTH: usize = 100; // Max heredocs queued at once (checked in try_heredoc)
const HEREDOC_TIMEOUT_MS: u64 = 5000; // 5 seconds wall-clock timeout for heredoc parsing
177
/// Configuration for the lexer
#[derive(Debug, Clone)]
pub struct LexerConfig {
    /// Enable interpolation parsing in strings (default: `true`)
    pub parse_interpolation: bool,
    /// Track token positions for error reporting (default: `true`)
    pub track_positions: bool,
    /// Maximum lookahead for disambiguation, in bytes (default: `1024`)
    pub max_lookahead: usize,
}
188
189impl Default for LexerConfig {
190 fn default() -> Self {
191 Self { parse_interpolation: true, track_positions: true, max_lookahead: 1024 }
192 }
193}
194
/// Mode-aware Perl lexer
///
/// Borrows the source text for lifetime `'a`; construct via [`PerlLexer::new`],
/// [`PerlLexer::with_config`], or [`PerlLexer::with_body_tokens`], then pull
/// tokens with `next_token` / `collect_tokens`.
pub struct PerlLexer<'a> {
    /// Source text being tokenized
    input: &'a str,
    /// Cached input bytes for faster access
    input_bytes: &'a [u8],
    /// Current byte offset into `input`
    position: usize,
    /// Term/operator expectation used to disambiguate `/`, `%`, `<<`, etc.
    mode: LexerMode,
    /// Lexer configuration (interpolation, positions, lookahead budget)
    config: LexerConfig,
    /// Stack for nested delimiters in s{}{} constructs
    delimiter_stack: Vec<char>,
    /// Track if we're inside prototype parens after 'sub'
    in_prototype: bool,
    /// Paren depth to track when we exit prototype
    prototype_depth: usize,
    /// Current position with line/column tracking
    #[allow(dead_code)]
    current_pos: Position,
    /// Track if we just skipped a newline (for __DATA__/__END__ detection)
    after_newline: bool,
    /// Queue of pending heredocs waiting for their bodies (FIFO order)
    pending_heredocs: Vec<HeredocSpec>,
    /// Track the byte offset of the current line's start
    line_start_offset: usize,
    /// If true, emit `HeredocBody` tokens; otherwise just consume them.
    emit_heredoc_body_tokens: bool,
    /// Current quote operator being parsed
    current_quote_op: Option<quote_handler::QuoteOperatorInfo>,
    /// Track if EOF has been emitted to prevent infinite loops
    eof_emitted: bool,
    /// Start time for timeout protection (heredoc parsing)
    start_time: std::time::Instant,
}
227
228impl<'a> PerlLexer<'a> {
229 /// Create a new lexer for the given input
230 pub fn new(input: &'a str) -> Self {
231 Self::with_config(input, LexerConfig::default())
232 }
233
234 /// Create a new lexer with custom configuration
235 pub fn with_config(input: &'a str, config: LexerConfig) -> Self {
236 Self {
237 input,
238 input_bytes: input.as_bytes(),
239 position: 0,
240 mode: LexerMode::ExpectTerm,
241 config,
242 delimiter_stack: Vec::new(),
243 in_prototype: false,
244 prototype_depth: 0,
245 current_pos: Position::start(),
246 after_newline: true, // Start of file counts as after newline
247 pending_heredocs: Vec::new(),
248 line_start_offset: 0,
249 emit_heredoc_body_tokens: false,
250 current_quote_op: None,
251 eof_emitted: false,
252 start_time: std::time::Instant::now(),
253 }
254 }
255
256 /// Create a new lexer that emits `HeredocBody` tokens (for LSP folding)
257 pub fn with_body_tokens(input: &'a str) -> Self {
258 let mut lexer = Self::new(input);
259 lexer.emit_heredoc_body_tokens = true;
260 lexer
261 }
262
263 /// Normalize file start by skipping BOM if present
264 fn normalize_file_start(&mut self) {
265 // Skip UTF-8 BOM (EF BB BF) if at file start
266 if self.position == 0 && self.matches_bytes(&[0xEF, 0xBB, 0xBF]) {
267 self.position = 3;
268 self.line_start_offset = 3;
269 }
270 }
271
    /// Set the lexer mode (for resetting state at statement boundaries)
    ///
    /// Forcing `ExpectTerm` after e.g. `;` lets context-sensitive tokens such
    /// as `/` be lexed as a regex rather than division.
    pub fn set_mode(&mut self, mode: LexerMode) {
        self.mode = mode;
    }
276
277 /// Helper to check if remaining bytes on a line are only spaces/tabs
278 #[inline]
279 fn trailing_ws_only(bytes: &[u8], mut p: usize) -> bool {
280 while p < bytes.len() && bytes[p] != b'\n' && bytes[p] != b'\r' {
281 match bytes[p] {
282 b' ' | b'\t' => p += 1,
283 _ => return false,
284 }
285 }
286 true
287 }
288
289 /// Consume a newline sequence (CRLF or LF) and update state
290 #[inline]
291 fn consume_newline(&mut self) {
292 if self.position >= self.input.len() {
293 return;
294 }
295 match self.input_bytes[self.position] {
296 b'\r' => {
297 self.position += 1;
298 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n' {
299 self.position += 1;
300 }
301 }
302 b'\n' => self.advance(),
303 _ => return, // not at a newline
304 }
305 self.after_newline = true;
306 self.line_start_offset = self.position;
307 }
308
309 /// Find the end of the current line, returning both raw end and visible end (without trailing CR)
310 #[inline]
311 fn find_line_end(bytes: &[u8], start: usize) -> (usize, usize) {
312 let mut end = start;
313 while end < bytes.len() && bytes[end] != b'\n' && bytes[end] != b'\r' {
314 end += 1;
315 }
316 // Visible end strips trailing \r if followed by \n
317 let visible_end = if end > start && end > 0 && bytes[end.saturating_sub(1)] == b'\r' {
318 end - 1
319 } else {
320 end
321 };
322 (end, visible_end)
323 }
324
    /// Get the next token from the input
    ///
    /// Main lexer entry point. Per call it:
    /// 1. Skips a UTF-8 BOM on the very first call (`position == 0`).
    /// 2. Delegates to the format-body / data-section sub-parsers when the
    ///    mode demands it.
    /// 3. Consumes any pending heredoc body (FIFO) before ordinary lexing,
    ///    bounded by `MAX_HEREDOC_BYTES` and `HEREDOC_TIMEOUT_MS`.
    /// 4. Skips whitespace/comments/POD, then tries each token class in
    ///    order: heredoc introducer, string, variable, number,
    ///    identifier/keyword, quote-operator delimiter, operator, delimiter.
    /// 5. Emits exactly one `EOF` token at end of input, then `None`.
    pub fn next_token(&mut self) -> Option<Token> {
        // Normalize file start (BOM) once
        if self.position == 0 {
            self.normalize_file_start();
        }

        // Loop to avoid recursion when processing heredocs
        loop {
            // Handle format body parsing if we're in that mode
            if matches!(self.mode, LexerMode::InFormatBody) {
                return self.parse_format_body();
            }

            // Handle data section parsing if we're in that mode
            if matches!(self.mode, LexerMode::InDataSection) {
                return self.parse_data_body();
            }

            // Check if we're inside a heredoc body BEFORE skipping whitespace
            let mut found_terminator = false;
            if !self.pending_heredocs.is_empty() {
                // Clone what we need to avoid holding a borrow
                // (body_start == 0 is the "not inside a body" sentinel)
                let (body_start, label, allow_indent) =
                    if let Some(spec) = self.pending_heredocs.first() {
                        if spec.body_start > 0
                            && self.position >= spec.body_start
                            && self.position < self.input.len()
                        {
                            (spec.body_start, spec.label.clone(), spec.allow_indent)
                        } else {
                            // Not in a heredoc body yet or at EOF
                            (0, empty_arc(), false)
                        }
                    } else {
                        (0, empty_arc(), false)
                    };

                if body_start > 0 {
                    // We're inside a heredoc body - scan for the terminator

                    // Scan line by line looking for the terminator
                    while self.position < self.input.len() {
                        // Timeout protection (Issue #443)
                        if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Budget cap for huge bodies - optimized check
                        if self.position - body_start > MAX_HEREDOC_BYTES {
                            // Remove the pending heredoc to avoid infinite loop
                            self.pending_heredocs.remove(0);
                            self.position = self.input.len();
                            return Some(Token {
                                token_type: TokenType::UnknownRest,
                                text: Arc::from(&self.input[body_start..]),
                                start: body_start,
                                end: self.input.len(),
                            });
                        }

                        // Skip to start of next line if not at line start
                        // Exception: if we're at body_start exactly, we're at the heredoc body start
                        if !self.after_newline && self.position != body_start {
                            while self.position < self.input.len()
                                && self.input_bytes[self.position] != b'\n'
                                && self.input_bytes[self.position] != b'\r'
                            {
                                self.advance();
                            }
                            self.consume_newline();
                            continue;
                        }

                        // We're at line start - check if this line is the terminator
                        let line_start = self.position;
                        let (line_end, line_visible_end) =
                            Self::find_line_end(self.input_bytes, self.position);
                        let line = &self.input[line_start..line_visible_end];
                        // Strip trailing spaces/tabs (Perl allows them)
                        let trimmed_end = line.trim_end_matches([' ', '\t']);

                        // Check if this line is the terminator
                        let is_terminator = if allow_indent {
                            // <<~HEREDOC: allow any leading spaces/tabs before the label
                            let mut p = 0;
                            while p < trimmed_end.len() {
                                let b = trimmed_end.as_bytes()[p];
                                if b == b' ' || b == b'\t' {
                                    p += 1;
                                } else {
                                    break;
                                }
                            }
                            trimmed_end[p..] == *label
                        } else {
                            // Must start at column 0 (no leading whitespace)
                            // The terminator is just the label (already trimmed trailing whitespace)
                            trimmed_end == &*label
                        };

                        if is_terminator {
                            // Found the terminator!
                            self.pending_heredocs.remove(0);
                            found_terminator = true;

                            // Consume past the terminator line
                            self.position = line_end;
                            self.consume_newline();

                            // Set body_start for the next pending heredoc (if any):
                            // stacked heredocs (`<<A . <<B`) share introducer lines,
                            // so B's body begins right after A's terminator.
                            if let Some(next) = self.pending_heredocs.first_mut()
                                && next.body_start == 0
                            {
                                next.body_start = self.position;
                            }

                            // Only emit HeredocBody if requested (for folding)
                            if self.emit_heredoc_body_tokens {
                                return Some(Token {
                                    token_type: TokenType::HeredocBody(empty_arc()),
                                    text: empty_arc(),
                                    start: body_start,
                                    end: line_start,
                                });
                            }
                            // Otherwise, continue the outer loop to get the next real token (avoiding recursion)
                            break; // Break inner while loop, continue outer loop
                        }

                        // Not the terminator, continue to next line
                        self.position = line_end;
                        self.consume_newline();
                    }

                    // If we didn't find a terminator, we reached EOF - emit error token
                    if !found_terminator {
                        // Remove the pending heredoc to avoid infinite loop
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                }

                // If we found a terminator, continue outer loop to get next token
                if found_terminator {
                    continue; // Continue outer loop to get next token
                }
            }

            self.skip_whitespace_and_comments()?;

            // Check again if we're now in a heredoc body (might have been set during skip_whitespace)
            if !self.pending_heredocs.is_empty()
                && let Some(spec) = self.pending_heredocs.first()
                && spec.body_start > 0
                && self.position >= spec.body_start
                && self.position < self.input.len()
            {
                continue; // Go back to top of loop to process heredoc
            }

            // If we reach EOF with pending heredocs, clear them and emit EOF
            if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
                self.pending_heredocs.clear();
            }

            if self.position >= self.input.len() {
                if self.eof_emitted {
                    return None; // Stop the stream
                }
                // Emit EOF exactly once, then None forever after.
                self.eof_emitted = true;
                return Some(Token {
                    token_type: TokenType::EOF,
                    text: empty_arc(),
                    start: self.position,
                    end: self.position,
                });
            }

            let start = self.position;

            // Check for special tokens first; order matters (e.g. heredoc
            // must win over the `<` operator, strings over identifiers).
            if let Some(token) = self.try_heredoc() {
                return Some(token);
            }

            if let Some(token) = self.try_string() {
                return Some(token);
            }

            if let Some(token) = self.try_variable() {
                return Some(token);
            }

            if let Some(token) = self.try_number() {
                return Some(token);
            }

            if let Some(token) = self.try_identifier_or_keyword() {
                return Some(token);
            }

            // If we're expecting a delimiter for a quote operator, only try delimiter
            if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
                if let Some(token) = self.try_delimiter() {
                    return Some(token);
                }
                // Do NOT fall through to try_operator / try_punct / etc.
                // Clear state first so we don't spin
                self.mode = LexerMode::ExpectOperator;
                self.current_quote_op = None;
                continue;
            }

            if let Some(token) = self.try_operator() {
                return Some(token);
            }

            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }

            // If nothing else matches, return an error token
            let ch = self.current_char()?;
            self.advance();

            // Optimize error token creation - avoid expensive formatting in hot path
            let text = if ch.is_ascii() {
                // Fast path for ASCII characters
                Arc::from(&self.input[start..self.position])
            } else {
                // Slower path for Unicode
                Arc::from(ch.to_string())
            };

            return Some(Token {
                token_type: TokenType::Error(Arc::from("Unexpected character")),
                text,
                start,
                end: self.position,
            });
        } // End of loop
    }
581
582 /// Budget guard to prevent infinite loops and timeouts (Issue #422)
583 ///
584 /// **Purpose**: Protect against pathological input that could cause:
585 /// - Infinite loops in regex/heredoc parsing
586 /// - Excessive memory consumption
587 /// - LSP server hangs
588 ///
589 /// **Limits**:
590 /// - `MAX_REGEX_BYTES` (64KB): Maximum bytes in a single regex literal
591 /// - `MAX_DELIM_NEST` (128): Maximum delimiter nesting depth
592 ///
593 /// **Graceful Degradation**:
594 /// - Budget exceeded → emit `UnknownRest` token
595 /// - Jump to EOF to prevent further parsing of problematic region
596 /// - LSP client can emit soft diagnostic about truncation
597 /// - All previously parsed symbols remain valid
598 ///
599 /// **Performance**:
600 /// - Fast path: inlined subtraction + comparison (~1-2 CPU cycles)
601 /// - Slow path: Only triggered on pathological input
602 /// - Amortized cost: O(1) per token
603 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
604 #[inline(always)]
605 fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
606 // Fast path: most calls won't hit limits
607 let bytes_consumed = self.position - start;
608 if bytes_consumed <= MAX_REGEX_BYTES && depth <= MAX_DELIM_NEST {
609 return None;
610 }
611
612 // Slow path: budget exceeded - graceful degradation
613 // Note: In production LSP, this event could be logged/metered for monitoring
614 #[cfg(debug_assertions)]
615 {
616 eprintln!(
617 "Budget exceeded: bytes={}, depth={}, at position={}",
618 bytes_consumed, depth, self.position
619 );
620 }
621
622 self.position = self.input.len();
623 Some(Token {
624 token_type: TokenType::UnknownRest,
625 text: Arc::from(""),
626 start,
627 end: self.position,
628 })
629 }
630
631 /// Peek at the next token without consuming it
632 pub fn peek_token(&mut self) -> Option<Token> {
633 let saved_pos = self.position;
634 let saved_mode = self.mode;
635 let saved_prototype = self.in_prototype;
636 let saved_depth = self.prototype_depth;
637 let saved_after_newline = self.after_newline;
638
639 let token = self.next_token();
640
641 self.position = saved_pos;
642 self.mode = saved_mode;
643 self.in_prototype = saved_prototype;
644 self.prototype_depth = saved_depth;
645 self.after_newline = saved_after_newline;
646
647 token
648 }
649
650 /// Get all remaining tokens
651 pub fn collect_tokens(&mut self) -> Vec<Token> {
652 let mut tokens = Vec::new();
653 while let Some(token) = self.next_token() {
654 if token.token_type == TokenType::EOF {
655 tokens.push(token);
656 break;
657 }
658 tokens.push(token);
659 }
660 tokens
661 }
662
663 /// Reset the lexer to the beginning
664 pub fn reset(&mut self) {
665 self.position = 0;
666 self.mode = LexerMode::ExpectTerm;
667 self.delimiter_stack.clear();
668 self.in_prototype = false;
669 self.prototype_depth = 0;
670 self.after_newline = true;
671 self.pending_heredocs.clear();
672 self.line_start_offset = 0;
673 }
674
    /// Switch lexer to format body parsing mode
    ///
    /// While in `InFormatBody` mode, `next_token` delegates to
    /// `parse_format_body` instead of normal tokenization. Intended for
    /// callers that have just recognized a `format` declaration; the
    /// format-body parser (not shown here) is expected to switch the mode back.
    pub fn enter_format_mode(&mut self) {
        self.mode = LexerMode::InFormatBody;
    }
679
680 // Internal helper methods
681
682 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
683 #[inline(always)]
684 fn byte_at(bytes: &[u8], index: usize) -> u8 {
685 debug_assert!(index < bytes.len());
686 match bytes.get(index) {
687 Some(&byte) => byte,
688 None => 0,
689 }
690 }
691
692 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
693 #[inline(always)]
694 fn current_char(&self) -> Option<char> {
695 if self.position < self.input_bytes.len() {
696 // For ASCII, direct access is safe
697 let byte = Self::byte_at(self.input_bytes, self.position);
698 if byte < 128 {
699 Some(byte as char)
700 } else {
701 // For non-ASCII, fall back to proper UTF-8 parsing
702 self.input.get(self.position..).and_then(|s| s.chars().next())
703 }
704 } else {
705 None
706 }
707 }
708
709 #[inline(always)]
710 fn peek_char(&self, offset: usize) -> Option<char> {
711 if offset > self.config.max_lookahead {
712 return None;
713 }
714
715 let pos = self.position.checked_add(offset)?;
716 if pos < self.input_bytes.len() {
717 // For ASCII, direct access is safe
718 let byte = Self::byte_at(self.input_bytes, pos);
719 if byte < 128 {
720 Some(byte as char)
721 } else {
722 // For non-ASCII, use chars iterator
723 self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
724 }
725 } else {
726 None
727 }
728 }
729
730 #[allow(clippy::inline_always)] // Performance critical in lexer hot path
731 #[inline(always)]
732 fn advance(&mut self) {
733 if self.position < self.input_bytes.len() {
734 let byte = Self::byte_at(self.input_bytes, self.position);
735 if byte < 128 {
736 // ASCII fast path
737 self.position += 1;
738 } else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
739 {
740 self.position += ch.len_utf8();
741 }
742 }
743 }
744
745 /// Fast byte-level check for ASCII characters
746 #[inline]
747 fn peek_byte(&self, offset: usize) -> Option<u8> {
748 if offset > self.config.max_lookahead {
749 return None;
750 }
751
752 let pos = self.position.checked_add(offset)?;
753 if pos < self.input_bytes.len() { Some(self.input_bytes[pos]) } else { None }
754 }
755
756 /// Check if the next bytes match a pattern (ASCII only)
757 #[inline]
758 fn matches_bytes(&self, pattern: &[u8]) -> bool {
759 let Some(end_offset) = pattern.len().checked_sub(1) else {
760 return true;
761 };
762
763 if end_offset > self.config.max_lookahead {
764 return false;
765 }
766
767 let Some(end) = self.position.checked_add(pattern.len()) else {
768 return false;
769 };
770
771 if end <= self.input_bytes.len() {
772 &self.input_bytes[self.position..end] == pattern
773 } else {
774 false
775 }
776 }
777
    /// Skip whitespace, `#` line comments, and POD blocks before the next token.
    ///
    /// Side effects:
    /// - Clears `after_newline` unless we sit exactly at a line start.
    /// - On every newline consumed, assigns `body_start` to the first pending
    ///   heredoc that does not have one yet (FIFO).
    /// - In `ExpectDelimiter` mode, `#` is a quote delimiter, not a comment,
    ///   and stops the skip.
    ///
    /// Always returns `Some(())`; the `Option` return exists so `next_token`
    /// can `?`-chain it.
    #[inline]
    fn skip_whitespace_and_comments(&mut self) -> Option<()> {
        // Don't reset after_newline if we're at the start of a line
        if self.position > 0 && self.position != self.line_start_offset {
            self.after_newline = false;
        }

        while self.position < self.input_bytes.len() {
            let byte = Self::byte_at(self.input_bytes, self.position);
            match byte {
                // Fast path for ASCII whitespace - batch process
                b' ' => {
                    // Batch skip spaces for better cache efficiency
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b' '
                    {
                        self.position += 1;
                    }
                    // Continue outer loop if we processed any spaces
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\t' => {
                    // Batch skip tabs
                    let start = self.position;
                    while self.position < self.input_bytes.len()
                        && Self::byte_at(self.input_bytes, self.position) == b'\t'
                    {
                        self.position += 1;
                    }
                    if self.position > start {
                        // Loop naturally continues to next iteration
                    }
                }
                b'\r' | b'\n' => {
                    self.consume_newline();

                    // Set body_start for the FIRST pending heredoc that needs it (FIFO)
                    // Only check if we have pending heredocs to avoid unnecessary work
                    if !self.pending_heredocs.is_empty() {
                        for spec in &mut self.pending_heredocs {
                            if spec.body_start == 0 {
                                spec.body_start = self.position;
                                break; // Only set for the first unresolved heredoc
                            }
                        }
                    }
                }
                b'#' => {
                    // In ExpectDelimiter mode, '#' is a delimiter, not a comment
                    if matches!(self.mode, LexerMode::ExpectDelimiter) {
                        break;
                    }

                    // Skip line comment using memchr for fast newline search
                    self.position += 1; // Skip # directly

                    // Use memchr to find newline quickly
                    // NOTE(review): only '\n' is searched; a comment on a line
                    // terminated by a lone '\r' is skipped through to the next
                    // '\n' or EOF — confirm lone-CR endings are out of scope.
                    if let Some(newline_offset) =
                        memchr::memchr(b'\n', &self.input_bytes[self.position..])
                    {
                        self.position += newline_offset;
                    } else {
                        // No newline found, skip to end
                        self.position = self.input_bytes.len();
                    }
                }
                // POD directives are only recognized at the start of a line.
                b'=' if self.position == 0
                    || (self.position > 0 && self.input_bytes[self.position - 1] == b'\n') =>
                {
                    // Check if this starts a POD section (=pod, =head, =over, etc.)
                    // Use byte-safe checks — avoid slicing &str at arbitrary byte positions
                    let remaining = &self.input_bytes[self.position..];
                    if remaining.starts_with(b"=pod")
                        || remaining.starts_with(b"=head")
                        || remaining.starts_with(b"=over")
                        || remaining.starts_with(b"=item")
                        || remaining.starts_with(b"=back")
                        || remaining.starts_with(b"=begin")
                        || remaining.starts_with(b"=end")
                        || remaining.starts_with(b"=for")
                        || remaining.starts_with(b"=encoding")
                    {
                        // Scan forward for \n=cut (end of POD block)
                        let search_start = self.position;
                        let mut found_cut = false;
                        let bytes = self.input_bytes;
                        let mut i = search_start;
                        while i < bytes.len() {
                            // Look for =cut at the start of a line
                            if (i == 0 || bytes[i - 1] == b'\n') && bytes[i..].starts_with(b"=cut")
                            {
                                i += 4; // Skip "=cut"
                                // Skip rest of the =cut line
                                while i < bytes.len() && bytes[i] != b'\n' {
                                    i += 1;
                                }
                                // Consume the trailing newline if present
                                if i < bytes.len() && bytes[i] == b'\n' {
                                    i += 1;
                                }
                                self.position = i;
                                found_cut = true;
                                break;
                            }
                            i += 1;
                        }
                        if !found_cut {
                            // POD extends to end of file
                            self.position = bytes.len();
                        }
                        continue;
                    }
                    // Not a POD directive - regular '=' token
                    break;
                }
                _ => {
                    // For non-ASCII whitespace, use char check only when needed
                    if byte >= 128
                        && let Some(ch) = self.current_char()
                        && ch.is_whitespace()
                    {
                        self.advance();
                        continue;
                    }
                    break;
                }
            }
        }
        Some(())
    }
911
912 fn try_heredoc(&mut self) -> Option<Token> {
913 // Check for heredoc start
914 if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
915 return None;
916 }
917
918 let start = self.position;
919 let mut text = String::from("<<");
920 self.position += 2; // Skip <<
921
922 // Check for indented heredoc (~)
923 let allow_indent = if self.current_char() == Some('~') {
924 text.push('~');
925 self.advance();
926 true
927 } else {
928 false
929 };
930
931 // Skip whitespace
932 while let Some(ch) = self.current_char() {
933 if ch == ' ' || ch == '\t' {
934 text.push(ch);
935 self.advance();
936 } else {
937 break;
938 }
939 }
940
941 // Optional backslash disables interpolation, treat like single-quoted label
942 let backslashed = if self.current_char() == Some('\\') {
943 text.push('\\');
944 self.advance();
945 true
946 } else {
947 false
948 };
949
950 // Parse delimiter
951 let delimiter = if self.position < self.input.len() {
952 match self.current_char() {
953 Some('"') if !backslashed => {
954 // Double-quoted delimiter
955 text.push('"');
956 self.advance();
957 let mut delim = String::new();
958 while self.position < self.input.len() {
959 if let Some(ch) = self.current_char() {
960 if ch == '"' {
961 text.push('"');
962 self.advance();
963 break;
964 }
965 delim.push(ch);
966 text.push(ch);
967 self.advance();
968 } else {
969 break;
970 }
971 }
972 delim
973 }
974 Some('\'') if !backslashed => {
975 // Single-quoted delimiter
976 text.push('\'');
977 self.advance();
978 let mut delim = String::new();
979 while self.position < self.input.len() {
980 if let Some(ch) = self.current_char() {
981 if ch == '\'' {
982 text.push('\'');
983 self.advance();
984 break;
985 }
986 delim.push(ch);
987 text.push(ch);
988 self.advance();
989 } else {
990 break;
991 }
992 }
993 delim
994 }
995 Some('`') if !backslashed => {
996 // Backtick delimiter
997 text.push('`');
998 self.advance();
999 let mut delim = String::new();
1000 while self.position < self.input.len() {
1001 if let Some(ch) = self.current_char() {
1002 if ch == '`' {
1003 text.push('`');
1004 self.advance();
1005 break;
1006 }
1007 delim.push(ch);
1008 text.push(ch);
1009 self.advance();
1010 } else {
1011 break;
1012 }
1013 }
1014 delim
1015 }
1016 Some(c) if is_perl_identifier_start(c) => {
1017 // Bare word delimiter
1018 let mut delim = String::new();
1019 while self.position < self.input.len() {
1020 if let Some(c) = self.current_char() {
1021 if is_perl_identifier_continue(c) {
1022 delim.push(c);
1023 text.push(c);
1024 self.advance();
1025 } else {
1026 break;
1027 }
1028 } else {
1029 break;
1030 }
1031 }
1032 delim
1033 }
1034 _ => {
1035 // Not a valid heredoc delimiter - reset position and return None
1036 // This allows << to be parsed as bitshift operator (e.g., 1 << 2)
1037 self.position = start;
1038 return None;
1039 }
1040 }
1041 } else {
1042 // No delimiter found - reset position and return None
1043 self.position = start;
1044 return None;
1045 };
1046
1047 // For now, return a placeholder token
1048 // The actual heredoc body would be parsed later when we encounter it
1049 self.mode = LexerMode::ExpectOperator;
1050
1051 // Recursion depth limit (Issue #443)
1052 if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
1053 return Some(Token {
1054 token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
1055 text: Arc::from(text),
1056 start,
1057 end: self.position,
1058 });
1059 }
1060
1061 // Queue the heredoc spec with its label
1062 self.pending_heredocs.push(HeredocSpec {
1063 label: Arc::from(delimiter.as_str()),
1064 body_start: 0, // Will be set when we see the newline after this line
1065 allow_indent,
1066 });
1067
1068 Some(Token {
1069 token_type: TokenType::HeredocStart,
1070 text: Arc::from(text),
1071 start,
1072 end: self.position,
1073 })
1074 }
1075
1076 fn try_string(&mut self) -> Option<Token> {
1077 let start = self.position;
1078 let quote = self.current_char()?;
1079
1080 match quote {
1081 '"' => self.parse_double_quoted_string(start),
1082 '\'' => self.parse_single_quoted_string(start),
1083 '`' => self.parse_backtick_string(start),
1084 'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
1085 _ => None,
1086 }
1087 }
1088
    /// Attempt to lex a numeric literal starting at the current position.
    ///
    /// Recognizes hex (`0x`), binary (`0b`), and explicit octal (`0o`)
    /// integers, plus decimal integers/floats with an optional fractional
    /// part and `e`/`E` exponent. Underscores are accepted between digits
    /// throughout, matching Perl's numeric-literal rules.
    ///
    /// Returns `None` when the current byte is not an ASCII digit. On
    /// success the mode becomes `ExpectOperator`, since a number completes
    /// a term (so a following `/` lexes as division, not a regex).
    #[inline]
    fn try_number(&mut self) -> Option<Token> {
        let start = self.position;

        // Fast byte check for digits - optimized bounds checking
        let bytes = self.input_bytes;
        if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
            return None;
        }

        // Check for hex (0x), binary (0b), or octal (0o) prefixes
        let mut pos = self.position;
        if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
            let prefix_byte = bytes[pos + 1];
            if prefix_byte == b'x' || prefix_byte == b'X' {
                // Hexadecimal: 0x[0-9a-fA-F_]+
                pos += 2; // consume '0x'
                let digit_start = pos;
                while pos < bytes.len() && (bytes[pos].is_ascii_hexdigit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                if pos > digit_start {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No hex digits after 0x - fall through to parse '0' as decimal
            } else if prefix_byte == b'b' || prefix_byte == b'B' {
                // Binary: 0b[01_]+
                pos += 2; // consume '0b'
                let digit_start = pos;
                while pos < bytes.len()
                    && (bytes[pos] == b'0' || bytes[pos] == b'1' || bytes[pos] == b'_')
                {
                    pos += 1;
                }
                if pos > digit_start {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No binary digits after 0b - fall through to parse '0' as decimal
            } else if prefix_byte == b'o' || prefix_byte == b'O' {
                // Octal (explicit): 0o[0-7_]+
                pos += 2; // consume '0o'
                let digit_start = pos;
                while pos < bytes.len()
                    && ((bytes[pos] >= b'0' && bytes[pos] <= b'7') || bytes[pos] == b'_')
                {
                    pos += 1;
                }
                if pos > digit_start {
                    self.position = pos;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Number(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                // No octal digits after 0o - fall through to parse '0' as decimal
            }
        }

        // Consume initial digits - unrolled for better performance
        pos = self.position;
        while pos < bytes.len() {
            let byte = Self::byte_at(bytes, pos);
            if byte.is_ascii_digit() || byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        self.position = pos;

        // Check for decimal point - optimized with single bounds check
        if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
            // Peek ahead to see what follows the dot
            let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();

            // Optimized dot consumption logic.
            // The dot joins the number when a digit follows (`1.5`), when it
            // ends the input (`1.`), or when whitespace, a terminator, an
            // operator, or an exponent marker follows (`1.;`, `1.e5`). A dot
            // followed by an identifier character is left unconsumed so it
            // can be re-lexed separately (e.g. `1.foo`).
            let should_consume_dot = has_following_digit || {
                pos + 1 >= bytes.len() || {
                    // Use bitwise operations for faster character classification
                    let next_byte = bytes[pos + 1];
                    // Whitespace, delimiters, operators - optimized check
                    next_byte <= b' '
                        || matches!(
                            next_byte,
                            b';' | b','
                                | b')'
                                | b'}'
                                | b']'
                                | b'+'
                                | b'-'
                                | b'*'
                                | b'/'
                                | b'%'
                                | b'='
                                | b'<'
                                | b'>'
                                | b'!'
                                | b'&'
                                | b'|'
                                | b'^'
                                | b'~'
                                | b'e'
                                | b'E'
                        )
                }
            };

            if should_consume_dot {
                pos += 1; // consume the dot
                // Consume fractional digits - batch processing
                while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                    pos += 1;
                }
                self.position = pos;
            }
        }

        // Check for exponent - optimized
        if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
            let exp_start = pos;
            pos += 1; // consume 'e' or 'E'

            // Check for optional sign
            if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                pos += 1;
            }

            // Must have at least one digit after exponent (underscores allowed between digits)
            let mut saw_digit = false;
            while pos < bytes.len() {
                let byte = bytes[pos];
                if byte.is_ascii_digit() {
                    saw_digit = true;
                    pos += 1;
                } else if byte == b'_' {
                    pos += 1;
                } else {
                    break;
                }
            }

            // If no digits after exponent, backtrack to before the 'e'/'E'
            // (and any consumed sign) so it can be lexed as a separate token
            if !saw_digit {
                pos = exp_start;
            }

            self.position = pos;
        }

        // Avoid string slicing for common number cases - use Arc::from directly on slice
        let text = &self.input[start..self.position];
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Number(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
1269
1270 fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
1271 // We're at the dot, consume it
1272 self.advance();
1273
1274 // Parse the fractional part
1275 while self.position < self.input_bytes.len() {
1276 let byte = self.input_bytes[self.position];
1277 match byte {
1278 b'0'..=b'9' | b'_' => self.position += 1,
1279 b'e' | b'E' => {
1280 // Handle scientific notation
1281 self.advance();
1282 if self.position < self.input_bytes.len() {
1283 let next = self.input_bytes[self.position];
1284 if next == b'+' || next == b'-' {
1285 self.advance();
1286 }
1287 }
1288 // Parse exponent digits (underscores allowed between digits)
1289 let exponent_start = self.position;
1290 let mut saw_digit = false;
1291 while self.position < self.input_bytes.len() {
1292 let byte = self.input_bytes[self.position];
1293 if byte.is_ascii_digit() {
1294 saw_digit = true;
1295 self.position += 1;
1296 } else if byte == b'_' {
1297 self.position += 1;
1298 } else {
1299 break;
1300 }
1301 }
1302
1303 // No digits after exponent marker, rewind so caller treats `e` as separate token.
1304 if !saw_digit {
1305 self.position = exponent_start.saturating_sub(1);
1306 }
1307 break;
1308 }
1309 _ => break,
1310 }
1311 }
1312
1313 let text = &self.input[start..self.position];
1314 self.mode = LexerMode::ExpectOperator;
1315
1316 Some(Token {
1317 token_type: TokenType::Number(Arc::from(text)),
1318 text: Arc::from(text),
1319 start,
1320 end: self.position,
1321 })
1322 }
1323
    /// Attempt to lex a variable at the current position: `$scalar`,
    /// `@array`, `%hash`, or `*glob`.
    ///
    /// Also handles the special forms visible below: `$#array` (array
    /// length), `${^NAME}` (caret variables), `$::{name}` (stash access),
    /// `${name}` (braced names), punctuation variables (`$!`, `$@`, `@-`,
    /// ...), and package-qualified names (`$Foo::bar`).
    ///
    /// For dereference syntax like `@{$ref}` or postfix-deref `->@*`, only
    /// the sigil itself is emitted as a token; the braces/brackets are left
    /// for subsequent tokens so the parser can structure the dereference.
    /// Returns `None` when the current char is not a sigil, or when `%`/`*`
    /// should instead be treated as an operator (ExpectOperator mode).
    fn try_variable(&mut self) -> Option<Token> {
        let start = self.position;
        let sigil = self.current_char()?;

        match sigil {
            '$' | '@' | '%' | '*' => {
                // In ExpectOperator mode, treat % and * as operators rather than sigils
                if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                    return None;
                }
                self.advance();

                // Special case: After ->, sigils followed by { or [ should be tokenized separately
                // This is for postfix dereference like ->@*, ->%{}, ->@[]
                // We need to be careful with Unicode - check if we have enough bytes and valid char boundaries
                let check_arrow = self.position >= 3
                    && self.position.saturating_sub(1) <= self.input.len()
                    && self.input.is_char_boundary(self.position.saturating_sub(3))
                    && self.input.is_char_boundary(self.position.saturating_sub(1));

                if check_arrow
                    && {
                        // Temporarily step back over "<arrow><sigil>" to test
                        // whether the two bytes before the sigil are "->".
                        let saved = self.position;
                        self.position -= 3;
                        let arrow = self.matches_bytes(b"->");
                        self.position = saved;
                        arrow
                    }
                    && matches!(self.current_char(), Some('{' | '[' | '*'))
                {
                    // Just return the sigil
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for $# (array length operator)
                if sigil == '$' && self.current_char() == Some('#') {
                    self.advance(); // consume #
                    // Now parse the array name
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else if ch == ':' && self.peek_char(1) == Some(':') {
                            // Package-qualified array name
                            self.advance();
                            self.advance();
                        } else {
                            break;
                        }
                    }

                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }

                // Check for special cases like ${^MATCH} or ${::{foo}} or *{$glob}
                if self.current_char() == Some('{') {
                    // Peek ahead to decide if we should consume the brace
                    let next_char = self.peek_char(1);

                    // Check if this is a dereference like @{$ref} or @{[...]}
                    // If the next char suggests dereference, don't consume the brace
                    if sigil != '*'
                        && matches!(
                            next_char,
                            Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r')
                        )
                    {
                        // This is a dereference, don't consume the brace
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;

                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }

                    self.advance(); // consume {

                    // Handle special variables with caret
                    if self.current_char() == Some('^') {
                        self.advance(); // consume ^
                        // Parse the special variable name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance(); // consume }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Handle stash access like $::{foo}
                    else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance(); // consume first :
                        self.advance(); // consume second :
                        // Skip optional { and }
                        if self.current_char() == Some('{') {
                            self.advance();
                        }
                        // Parse the name
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                if self.current_char() == Some('}') {
                                    self.advance(); // consume closing } of ${...}
                                }
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                    // Regular braced variable like ${foo} or glob like *{$glob}
                    else {
                        // Check if this is a dereference like ${$ref} or @{$ref} or @{[...]}
                        // If the next char is a sigil or other expression starter, we should stop here and let the parser handle it
                        // EXCEPT for globs - *{$glob} should be parsed as one token
                        // Also check for empty braces or EOF - in these cases we should split the tokens
                        if sigil != '*'
                            && (matches!(
                                self.current_char(),
                                Some(
                                    '$' | '@'
                                        | '%'
                                        | '*'
                                        | '&'
                                        | '['
                                        | ' '
                                        | '\t'
                                        | '\n'
                                        | '\r'
                                        | '}'
                                )
                            ) || self.current_char().is_none())
                        {
                            // This is a dereference or empty/invalid brace, backtrack
                            self.position = start + 1; // Just past the sigil
                            let text = &self.input[start..self.position];
                            self.mode = LexerMode::ExpectOperator;

                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                text: Arc::from(text),
                                start,
                                end: self.position,
                            });
                        }

                        // For glob access, we need to consume everything inside braces
                        if sigil == '*' {
                            // Track nesting so *{...{...}...} consumes the
                            // whole braced expression as one token
                            let mut brace_depth: usize = 1;
                            while let Some(ch) = self.current_char() {
                                if ch == '{' {
                                    brace_depth += 1;
                                } else if ch == '}' {
                                    brace_depth = brace_depth.saturating_sub(1);
                                    if brace_depth == 0 {
                                        self.advance(); // consume final }
                                        break;
                                    }
                                }
                                self.advance();
                            }
                        } else {
                            // Regular variable
                            while let Some(ch) = self.current_char() {
                                if ch == '}' {
                                    self.advance(); // consume }
                                    break;
                                } else if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                }
                // Parse regular variable name
                else if let Some(ch) = self.current_char() {
                    if is_perl_identifier_start(ch) {
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                        // Handle package-qualified segments like Foo::bar
                        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                            self.advance();
                            self.advance();
                            while let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                        }
                    }
                    // Handle special punctuation variables
                    else if sigil == '$'
                        && matches!(
                            ch,
                            '?' | '!'
                                | '@'
                                | '&'
                                | '`'
                                | '\''
                                | '.'
                                | '/'
                                | '\\'
                                | '|'
                                | '+'
                                | '-'
                                | '['
                                | ']'
                                | '$'
                        )
                    {
                        self.advance(); // consume the special character
                    }
                    // Handle special array/hash punctuation variables
                    else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                        self.advance(); // consume the + or -
                    }
                }

                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;

                Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                })
            }
            _ => None,
        }
    }
1588
1589 /// Return next non-space char without consuming.
1590 fn peek_nonspace(&self) -> Option<char> {
1591 let mut i = self.position;
1592 while i < self.input.len() {
1593 let c = self.input.get(i..).and_then(|s| s.chars().next())?;
1594 if c.is_whitespace() {
1595 i += c.len_utf8();
1596 continue;
1597 }
1598 return Some(c);
1599 }
1600 None
1601 }
1602
1603 /// Is `c` a valid quote-like delimiter? (non-alnum, including paired)
1604 fn is_quote_delim(c: char) -> bool {
1605 // Quote delimiters are punctuation, but not whitespace or control characters
1606 !c.is_ascii_alphanumeric() && !c.is_whitespace() && !c.is_control()
1607 }
1608
    /// Attempt to lex a bareword at the current position, classifying it as
    /// a keyword, identifier, data-section marker, or the start of a
    /// quote-like / substitution / transliteration operator.
    ///
    /// Handles, in order: `s'`/`y'`/`tr'` (quote-delimited substitution and
    /// transliteration), plain identifiers with legacy `'` and `::` package
    /// separators, `__DATA__`/`__END__` markers (only at line start in the
    /// code channel), `s`/`tr`/`y` followed by a delimiter, and quote
    /// operators (`q`, `qq`, `qw`, `qx`, `qr`, `m`, ...) which hand off to
    /// `parse_quote_operator`. Mode-changing keywords (`if`, `while`, ...)
    /// set `ExpectTerm`; other identifiers set `ExpectOperator`.
    #[inline]
    fn try_identifier_or_keyword(&mut self) -> Option<Token> {
        let start = self.position;
        let ch = self.current_char()?;

        if is_perl_identifier_start(ch) {
            // Special case: substitution/transliteration with single-quote delimiter
            // The single quote is considered an identifier continuation, so we need to
            // detect these operators before consuming it as part of an identifier.
            if ch == 's' && self.peek_char(1) == Some('\'') {
                self.advance(); // consume 's'
                return self.parse_substitution(start);
            } else if ch == 'y' && self.peek_char(1) == Some('\'') {
                self.advance(); // consume 'y'
                return self.parse_transliteration(start);
            } else if ch == 't' && self.peek_char(1) == Some('r') && self.peek_char(2) == Some('\'')
            {
                self.advance(); // consume 't'
                self.advance(); // consume 'r'
                return self.parse_transliteration(start);
            }

            while let Some(ch) = self.current_char() {
                // Single quote is usually allowed inside Perl identifiers (legacy package separator),
                // but it can also be the delimiter for quote-like operators (q'..', qq'..', qr'..', m'..').
                // If we've already read one of those operator words, stop before consuming the quote
                // so the quote-operator path can handle it.
                if ch == '\''
                    && matches!(
                        &self.input[start..self.position],
                        "m" | "q" | "qq" | "qw" | "qx" | "qr"
                    )
                {
                    break;
                }

                if is_perl_identifier_continue(ch) {
                    self.advance();
                } else {
                    break;
                }
            }
            // Handle package-qualified identifiers like Foo::bar
            while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                // consume '::'
                self.advance();
                self.advance();

                // consume following identifier segment if present
                if let Some(ch) = self.current_char()
                    && is_perl_identifier_start(ch)
                {
                    self.advance();
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
            }

            let text = &self.input[start..self.position];

            // Check for __DATA__ and __END__ markers using exact match
            // Only recognize these in code channel, not inside data/format sections or heredocs
            let in_code_channel =
                !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
                    && self.pending_heredocs.is_empty();

            let marker = if in_code_channel {
                if text == "__DATA__" {
                    Some("__DATA__")
                } else if text == "__END__" {
                    Some("__END__")
                } else {
                    None
                }
            } else {
                None
            };

            if let Some(marker_text) = marker {
                // These must be at the beginning of a line
                // Use the after_newline flag to determine if we're at line start
                if self.after_newline {
                    // Check if rest of line is only whitespace
                    // Only treat as data marker if line has no trailing junk
                    if Self::trailing_ws_only(self.input_bytes, self.position) {
                        // Consume the rest of the line (the marker line)
                        while self.position < self.input.len()
                            && self.input_bytes[self.position] != b'\n'
                        {
                            self.advance();
                        }
                        if self.position < self.input.len()
                            && self.input_bytes[self.position] == b'\n'
                        {
                            self.advance();
                        }

                        // Switch to data section mode
                        self.mode = LexerMode::InDataSection;

                        return Some(Token {
                            token_type: TokenType::DataMarker(Arc::from(marker_text)),
                            text: Arc::from(marker_text),
                            start,
                            end: self.position,
                        });
                    }
                }
            }

            // Check for substitution/transliteration operators
            #[allow(clippy::collapsible_if)]
            if matches!(text, "s" | "tr" | "y") {
                if let Some(next) = self.current_char() {
                    // Check if followed by a delimiter
                    if matches!(
                        next,
                        '/' | '|'
                            | '\''
                            | '{'
                            | '['
                            | '('
                            | '<'
                            | '!'
                            | '#'
                            | '@'
                            | '$'
                            | '%'
                            | '^'
                            | '&'
                            | '*'
                            | '+'
                            | '='
                            | '~'
                            | '`'
                    ) {
                        match text {
                            "s" => {
                                return self.parse_substitution(start);
                            }
                            "tr" | "y" => {
                                return self.parse_transliteration(start);
                            }
                            // Defensive arm: `text` is already matched above,
                            // so this is unreachable in practice; keep a
                            // diagnostic token rather than panicking.
                            unexpected => {
                                // Return diagnostic token instead of panicking
                                return Some(Token {
                                    token_type: TokenType::Error(Arc::from(format!(
                                        "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
                                        unexpected, start
                                    ))),
                                    text: Arc::from(unexpected),
                                    start,
                                    end: self.position,
                                });
                            }
                        }
                    }
                }
            }

            let token_type = if is_keyword(text) {
                // Check for special keywords that affect lexer mode
                match text {
                    "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
                    | "sort" | "split" => {
                        self.mode = LexerMode::ExpectTerm;
                    }
                    "sub" => {
                        self.in_prototype = true;
                    }
                    // Quote operators expect a delimiter next (must be immediately adjacent)
                    op if quote_handler::is_quote_operator(op) => {
                        // For regex operators like 'm', 's', 'tr', 'y', delimiter must be immediately adjacent
                        // For quote operators like 'q', 'qq', 'qw', 'qr', 'qx', we allow whitespace
                        let next_char = if matches!(op, "m" | "s" | "tr" | "y") {
                            self.current_char() // Must be immediately adjacent
                        } else {
                            self.peek_nonspace() // Can skip whitespace
                        };

                        if let Some(next) = next_char {
                            if Self::is_quote_delim(next) {
                                self.mode = LexerMode::ExpectDelimiter;
                                self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
                                    operator: op.to_string(),
                                    delimiter: '\0', // Will be set when we see the delimiter
                                    start_pos: start,
                                });

                                // Don't return a keyword token - continue to parse the delimiter
                                // Skip any whitespace between operator and delimiter
                                while let Some(ch) = self.current_char() {
                                    if ch.is_whitespace() {
                                        self.advance();
                                    } else {
                                        break;
                                    }
                                }

                                // Get the delimiter
                                #[allow(clippy::collapsible_if)]
                                if let Some(delim) = self.current_char() {
                                    if !delim.is_alphanumeric() {
                                        self.advance();
                                        if let Some(ref mut info) = self.current_quote_op {
                                            info.delimiter = delim;
                                        }
                                        // Parse the quote operator content and return the complete token
                                        return self.parse_quote_operator(delim);
                                    }
                                }
                            } else {
                                // Not a quote operator here → treat as IDENTIFIER
                                self.current_quote_op = None;
                                self.mode = LexerMode::ExpectOperator;
                                return Some(Token {
                                    token_type: TokenType::Identifier(Arc::from(text)),
                                    start,
                                    end: self.position,
                                    text: Arc::from(text),
                                });
                            }
                        } else {
                            // End-of-input after the word → also treat as IDENTIFIER
                            self.current_quote_op = None;
                            self.mode = LexerMode::ExpectOperator;
                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                start,
                                end: self.position,
                                text: Arc::from(text),
                            });
                        }
                        // If we get here but haven't returned, something went wrong
                        // Fall through to treat as identifier
                        self.current_quote_op = None;
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            start,
                            end: self.position,
                            text: Arc::from(text),
                        });
                    }
                    // Format declarations need special handling
                    "format" => {
                        // We'll need to check for the = after the format name
                        // For now, just mark that we saw format
                    }
                    _ => {}
                }
                TokenType::Keyword(Arc::from(text))
            } else {
                self.mode = LexerMode::ExpectOperator;
                TokenType::Identifier(Arc::from(text))
            };

            Some(Token { token_type, text: Arc::from(text), start, end: self.position })
        } else {
            None
        }
    }
1876
1877 /// Parse data section body - consumes everything to EOF
1878 fn parse_data_body(&mut self) -> Option<Token> {
1879 if self.position >= self.input.len() {
1880 // Already at EOF
1881 self.mode = LexerMode::ExpectTerm;
1882 return Some(Token {
1883 token_type: TokenType::EOF,
1884 text: Arc::from(""),
1885 start: self.position,
1886 end: self.position,
1887 });
1888 }
1889
1890 let start = self.position;
1891 // Consume everything to EOF
1892 let body = &self.input[self.position..];
1893 self.position = self.input.len();
1894
1895 // Reset mode for next parse (though we're at EOF)
1896 self.mode = LexerMode::ExpectTerm;
1897
1898 Some(Token {
1899 token_type: TokenType::DataBody(Arc::from(body)),
1900 text: Arc::from(body),
1901 start,
1902 end: self.position,
1903 })
1904 }
1905
1906 /// Parse format body - consumes until a line with just a dot
1907 fn parse_format_body(&mut self) -> Option<Token> {
1908 let start = self.position;
1909 let mut body = String::new();
1910 let mut line_start = true;
1911
1912 while self.position < self.input.len() {
1913 // Check if we're at the start of a line and the next char is a dot
1914 if line_start && self.current_char() == Some('.') {
1915 // Check if this line contains only a dot
1916 let mut peek_pos = self.position + 1;
1917 let mut found_terminator = true;
1918
1919 // Skip any trailing whitespace on the dot line
1920 while peek_pos < self.input.len() {
1921 match self.input_bytes[peek_pos] {
1922 b' ' | b'\t' | b'\r' => peek_pos += 1,
1923 b'\n' => break,
1924 _ => {
1925 found_terminator = false;
1926 break;
1927 }
1928 }
1929 }
1930
1931 if found_terminator {
1932 // We found the terminating dot, consume it
1933 self.position = peek_pos;
1934 if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
1935 {
1936 self.position += 1;
1937 }
1938
1939 // Switch back to normal mode
1940 self.mode = LexerMode::ExpectTerm;
1941
1942 return Some(Token {
1943 token_type: TokenType::FormatBody(Arc::from(body.clone())),
1944 text: Arc::from(body),
1945 start,
1946 end: self.position,
1947 });
1948 }
1949 }
1950
1951 // Not a terminator, consume the character
1952 match self.current_char() {
1953 Some(ch) => {
1954 body.push(ch);
1955 self.advance();
1956
1957 // Track if we're at the start of a line
1958 line_start = ch == '\n';
1959 }
1960 None => {
1961 // Reached EOF without finding terminator
1962 break;
1963 }
1964 }
1965 }
1966
1967 // If we reach here, we didn't find a terminator
1968 self.mode = LexerMode::ExpectTerm;
1969 Some(Token {
1970 token_type: TokenType::Error(Arc::from("Unterminated format body")),
1971 text: Arc::from(body),
1972 start,
1973 end: self.position,
1974 })
1975 }
1976
    /// Attempt to lex an operator at the current position.
    ///
    /// Performs the mode-based `/` disambiguation described in the block
    /// comment below (regex vs. division vs. `//`), handles `.`-leading
    /// decimals like `.5`, and greedily consumes two- and three-character
    /// compound operators (`**=`, `<<=`, `<=>`, `...`, ...). Most operators
    /// switch the mode to `ExpectTerm`; postfix `++`/`--` keep
    /// `ExpectOperator` since they complete a term. Returns `None` when the
    /// current char starts no operator, or when a quote-operator delimiter
    /// is pending.
    fn try_operator(&mut self) -> Option<Token> {
        // Skip operator parsing if we're expecting a delimiter for a quote operator
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            return None;
        }

        let start = self.position;
        let ch = self.current_char()?;

        // ═══════════════════════════════════════════════════════════════════════
        // SLASH DISAMBIGUATION STRATEGY (Issue #422)
        // ═══════════════════════════════════════════════════════════════════════
        //
        // Perl's `/` character is ambiguous:
        //   - Division operator: `$x / 2`
        //   - Regex delimiter: `/pattern/`
        //   - Defined-or operator: `$x // $y`
        //
        // **Disambiguation Strategy (Context-Aware Heuristics):**
        //
        // 1. **Mode-Based Decision (Primary)**:
        //    - `LexerMode::ExpectTerm` → `/` starts a regex
        //      Examples: `if (/pattern/)`, `=~ /test/`, `( /regex/`
        //    - `LexerMode::ExpectOperator` → `/` is division or `//`
        //      Examples: `$x / 2`, `$x // $y`, `) / 3`
        //
        // 2. **Context Heuristics (Secondary - Implicit in Mode)**:
        //    Mode is set based on previous token:
        //    - After identifier/number/closing paren → ExpectOperator → division
        //    - After operator/keyword/opening paren → ExpectTerm → regex
        //
        // 3. **Timeout Protection**:
        //    - Regex parsing has budget guard: MAX_REGEX_BYTES (64KB)
        //    - Budget exceeded → emit UnknownRest token (graceful degradation)
        //    - See `parse_regex()` and `budget_guard()` for implementation
        //
        // 4. **Performance Characteristics**:
        //    - Single-pass: O(1) decision based on mode flag
        //    - No backtracking: Mode updated after each token
        //    - Optimized: Byte-level operations for common cases
        //
        // **Metrics & Monitoring**:
        // - Budget exceeded events tracked via UnknownRest token emission
        // - LSP diagnostics generated for truncated regexes
        // - Test coverage: lexer_slash_timeout_tests.rs (21 test cases)
        //
        // ═══════════════════════════════════════════════════════════════════════

        if ch == '/' {
            if self.mode == LexerMode::ExpectTerm {
                // Mode indicates we're expecting a term → `/` starts a regex
                // Examples: `if (/pattern/)`, `=~ /test/`, `while (/match/)`
                return self.parse_regex(start);
            } else {
                // Mode indicates we're expecting an operator → `/` is division or `//`
                // Examples: `$x / 2`, `$x // $y`, `10 / 3`
                self.advance();
                // Check for // or //= using byte-level operations for speed
                if self.peek_byte(0) == Some(b'/') {
                    self.position += 1; // consume second / directly
                    if self.peek_byte(0) == Some(b'=') {
                        self.position += 1; // consume = directly
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    } else {
                        // Use cached string for common "//" operator
                        self.mode = LexerMode::ExpectTerm;
                        return Some(Token {
                            token_type: TokenType::Operator(Arc::from("//")),
                            text: Arc::from("//"),
                            start,
                            end: self.position,
                        });
                    }
                } else if self.position < self.input_bytes.len()
                    && self.input_bytes[self.position] == b'='
                {
                    // /= division-assign operator
                    self.position += 1; // consume =
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Operator(Arc::from("/=")),
                        text: Arc::from("/="),
                        start,
                        end: self.position,
                    });
                } else {
                    // Use cached string for common "/" division
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Division,
                        text: Arc::from("/"),
                        start,
                        end: self.position,
                    });
                }
            }
        }

        // Handle other operators - simplified
        match ch {
            '.' => {
                // Check if it's a decimal number like .5
                if self.peek_char(1).is_some_and(|c| c.is_ascii_digit()) {
                    return self.parse_decimal_number(start);
                }
                self.advance();
                // Check for compound operators
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            } else if ch == '.' && next == '.' && third == Some('.') {
                                self.advance(); // consume the third .
                            }
                        }
                    }
                }
            }
            '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
            | '\\' => {
                self.advance();
                // Check for compound operators
                #[allow(clippy::collapsible_if)]
                if let Some(next) = self.current_char() {
                    if is_compound_operator(ch, next) {
                        self.advance();

                        // Check for three-character operators like **=, <<=, >>=
                        if self.position < self.input.len() {
                            let third = self.current_char();
                            // Check for three-character operators
                            if matches!(
                                (ch, next, third),
                                ('*', '*', Some('='))
                                    | ('<', '<', Some('='))
                                    | ('>', '>', Some('='))
                                    | ('&', '&', Some('='))
                                    | ('|', '|', Some('='))
                                    | ('/', '/', Some('='))
                            ) {
                                self.advance(); // consume the =
                            } else if ch == '<' && next == '=' && third == Some('>') {
                                self.advance(); // consume the >
                                // Special case: <=> spaceship operator
                            }
                        }
                    }
                }
            }
            _ => return None,
        }

        let text = &self.input[start..self.position];
        // Postfix ++ and -- complete a term expression, so next token is an operator
        // (e.g., "$x++ / 2" → / is division, not regex)
        if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
            // Postfix: stay in ExpectOperator
        } else {
            self.mode = LexerMode::ExpectTerm;
        }

        Some(Token {
            token_type: TokenType::Operator(Arc::from(text)),
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
2170
2171 fn try_delimiter(&mut self) -> Option<Token> {
2172 let start = self.position;
2173 let ch = self.current_char()?;
2174
2175 // If we're expecting a delimiter for a quote operator, handle it specially
2176 if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
2177 // Accept any non-alphanumeric character as a delimiter
2178 if !ch.is_alphanumeric() && !ch.is_whitespace() {
2179 self.advance();
2180 if let Some(ref mut info) = self.current_quote_op {
2181 info.delimiter = ch;
2182 }
2183 // Now parse the quote operator content
2184 return self.parse_quote_operator(ch);
2185 }
2186 }
2187
2188 match ch {
2189 '(' => {
2190 // Check if this is a quote operator delimiter
2191 if matches!(self.mode, LexerMode::ExpectDelimiter)
2192 && self.current_quote_op.is_some()
2193 {
2194 self.advance();
2195 if let Some(ref mut info) = self.current_quote_op {
2196 info.delimiter = ch;
2197 }
2198 return self.parse_quote_operator(ch);
2199 }
2200
2201 self.advance();
2202 if self.in_prototype {
2203 self.prototype_depth += 1;
2204 }
2205 self.mode = LexerMode::ExpectTerm;
2206 Some(Token {
2207 token_type: TokenType::LeftParen,
2208 text: Arc::from("("),
2209 start,
2210 end: self.position,
2211 })
2212 }
2213 ')' => {
2214 self.advance();
2215 if self.in_prototype && self.prototype_depth > 0 {
2216 self.prototype_depth -= 1;
2217 if self.prototype_depth == 0 {
2218 self.in_prototype = false;
2219 }
2220 }
2221 self.mode = LexerMode::ExpectOperator;
2222 Some(Token {
2223 token_type: TokenType::RightParen,
2224 text: Arc::from(")"),
2225 start,
2226 end: self.position,
2227 })
2228 }
2229 ';' => {
2230 self.advance();
2231 self.mode = LexerMode::ExpectTerm;
2232 Some(Token {
2233 token_type: TokenType::Semicolon,
2234 text: Arc::from(";"),
2235 start,
2236 end: self.position,
2237 })
2238 }
2239 ',' => {
2240 self.advance();
2241 self.mode = LexerMode::ExpectTerm;
2242 Some(Token {
2243 token_type: TokenType::Comma,
2244 text: Arc::from(","),
2245 start,
2246 end: self.position,
2247 })
2248 }
2249 '[' => {
2250 self.advance();
2251 self.mode = LexerMode::ExpectTerm;
2252 Some(Token {
2253 token_type: TokenType::LeftBracket,
2254 text: Arc::from("["),
2255 start,
2256 end: self.position,
2257 })
2258 }
2259 ']' => {
2260 self.advance();
2261 self.mode = LexerMode::ExpectOperator;
2262 Some(Token {
2263 token_type: TokenType::RightBracket,
2264 text: Arc::from("]"),
2265 start,
2266 end: self.position,
2267 })
2268 }
2269 '{' => {
2270 self.advance();
2271 self.mode = LexerMode::ExpectTerm;
2272 Some(Token {
2273 token_type: TokenType::LeftBrace,
2274 text: Arc::from("{"),
2275 start,
2276 end: self.position,
2277 })
2278 }
2279 '}' => {
2280 self.advance();
2281 self.mode = LexerMode::ExpectOperator;
2282 Some(Token {
2283 token_type: TokenType::RightBrace,
2284 text: Arc::from("}"),
2285 start,
2286 end: self.position,
2287 })
2288 }
2289 '#' => {
2290 // Only treat as delimiter in ExpectDelimiter mode
2291 if matches!(self.mode, LexerMode::ExpectDelimiter) {
2292 self.advance();
2293 // Reset mode after consuming delimiter
2294 self.mode = LexerMode::ExpectTerm;
2295 Some(Token {
2296 token_type: TokenType::Operator(Arc::from("#")),
2297 text: Arc::from("#"),
2298 start,
2299 end: self.position,
2300 })
2301 } else {
2302 None
2303 }
2304 }
2305 _ => None,
2306 }
2307 }
2308
    /// Parse a double-quoted string literal starting at the opening `"`.
    ///
    /// Produces `StringLiteral` when no interpolation occurred, or
    /// `InterpolatedString(parts)` with alternating literal/variable parts
    /// when `$name` interpolation was found (and enabled via
    /// `config.parse_interpolation`). Escape sequences are kept verbatim
    /// (backslash + following char). On a missing closing quote the rest of
    /// the input is consumed and an `Error` token is returned.
    ///
    /// NOTE(review): `${name}`-style interpolation is not handled here, and a
    /// bare `$` with no identifier after it is dropped from `parts` (though
    /// the token `text` still contains the full source slice) — confirm
    /// whether that is intended.
    fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
        self.advance(); // Skip opening quote
        let mut parts = Vec::new();
        let mut current_literal = String::new();
        let mut last_pos = self.position;

        while let Some(ch) = self.current_char() {
            match ch {
                '"' => {
                    // Closing quote: flush any pending literal text and emit.
                    self.advance();
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                    }

                    let text = &self.input[start..self.position];
                    // A string completes a term, so `/` after it is division.
                    self.mode = LexerMode::ExpectOperator;

                    return Some(Token {
                        token_type: if parts.is_empty() {
                            TokenType::StringLiteral
                        } else {
                            TokenType::InterpolatedString(parts)
                        },
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                '\\' => {
                    // Keep escapes verbatim: push '\' and the escaped char.
                    self.advance();
                    if let Some(escaped) = self.current_char() {
                        // Optimize by reserving space to avoid frequent reallocations
                        if current_literal.capacity() == 0 {
                            current_literal.reserve(32);
                        }
                        current_literal.push('\\');
                        current_literal.push(escaped);
                        self.advance();
                    }
                }
                '$' if self.config.parse_interpolation => {
                    // Handle variable interpolation - avoid unnecessary clone
                    if !current_literal.is_empty() {
                        parts.push(StringPart::Literal(Arc::from(current_literal)));
                        current_literal = String::new(); // Clear without cloning
                    }

                    // Parse variable - optimized using byte-level checks where possible
                    self.advance();
                    let var_start = self.position;

                    // Fast path for ASCII identifier continuation
                    while self.position < self.input_bytes.len() {
                        let byte = self.input_bytes[self.position];
                        if byte.is_ascii_alphanumeric() || byte == b'_' {
                            self.position += 1;
                        } else if byte >= 128 {
                            // Only use UTF-8 parsing for non-ASCII
                            if let Some(ch) = self.current_char() {
                                if is_perl_identifier_continue(ch) {
                                    self.advance();
                                } else {
                                    break;
                                }
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    }

                    if self.position > var_start {
                        // var_start - 1 re-includes the '$' sigil in the name.
                        let var_name = &self.input[var_start - 1..self.position];
                        parts.push(StringPart::Variable(Arc::from(var_name)));
                    }
                }
                _ => {
                    // Optimize string building with better capacity management
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push(ch);
                    self.advance();
                }
            }

            // Safety check: ensure we're making progress
            if self.position == last_pos {
                break;
            }
            last_pos = self.position;
        }

        // Unterminated string - return error token consuming rest of input
        let end = self.input.len();
        self.position = end;

        Some(Token {
            token_type: TokenType::Error(Arc::from("unterminated string")),
            text: Arc::from(&self.input[start..end]),
            start,
            end,
        })
    }
2414
2415 fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
2416 self.advance(); // Skip opening quote
2417
2418 let mut last_pos = self.position;
2419
2420 while let Some(ch) = self.current_char() {
2421 match ch {
2422 '\'' => {
2423 self.advance();
2424 let text = &self.input[start..self.position];
2425 self.mode = LexerMode::ExpectOperator;
2426
2427 return Some(Token {
2428 token_type: TokenType::StringLiteral,
2429 text: Arc::from(text),
2430 start,
2431 end: self.position,
2432 });
2433 }
2434 '\\' => {
2435 self.advance();
2436 if self.current_char() == Some('\'') || self.current_char() == Some('\\') {
2437 self.advance();
2438 }
2439 }
2440 _ => self.advance(),
2441 }
2442
2443 // Safety check: ensure we're making progress
2444 if self.position == last_pos {
2445 break;
2446 }
2447 last_pos = self.position;
2448 }
2449
2450 // Unterminated string - return error token consuming rest of input
2451 let end = self.input.len();
2452 self.position = end;
2453
2454 Some(Token {
2455 token_type: TokenType::Error(Arc::from("unterminated string")),
2456 text: Arc::from(&self.input[start..end]),
2457 start,
2458 end,
2459 })
2460 }
2461
2462 fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
2463 self.advance(); // Skip opening backtick
2464
2465 let mut last_pos = self.position;
2466
2467 while let Some(ch) = self.current_char() {
2468 match ch {
2469 '`' => {
2470 self.advance();
2471 let text = &self.input[start..self.position];
2472 self.mode = LexerMode::ExpectOperator;
2473
2474 return Some(Token {
2475 token_type: TokenType::QuoteCommand,
2476 text: Arc::from(text),
2477 start,
2478 end: self.position,
2479 });
2480 }
2481 '\\' => {
2482 self.advance();
2483 if self.current_char().is_some() {
2484 self.advance();
2485 }
2486 }
2487 _ => self.advance(),
2488 }
2489
2490 // Safety check: ensure we're making progress
2491 if self.position == last_pos {
2492 break;
2493 }
2494 last_pos = self.position;
2495 }
2496
2497 // Unterminated string - return error token consuming rest of input
2498 let end = self.input.len();
2499 self.position = end;
2500
2501 Some(Token {
2502 token_type: TokenType::Error(Arc::from("unterminated string")),
2503 text: Arc::from(&self.input[start..end]),
2504 start,
2505 end,
2506 })
2507 }
2508
    /// Placeholder for a dedicated q-string parser.
    ///
    /// Always returns `None`; q/qq/qw/qx appear to be handled through the
    /// quote-operator path (`parse_quote_operator`) instead — TODO confirm
    /// and either implement or remove this stub.
    fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
        // Simplified q-string parsing
        None
    }
2513
2514 /// Returns the closing delimiter for paired delimiters, or the same character for non-paired.
2515 /// This helper makes delimiter pairing explicit and avoids unreachable code paths.
2516 fn paired_closing(delim: char) -> char {
2517 match delim {
2518 '{' => '}',
2519 '[' => ']',
2520 '(' => ')',
2521 '<' => '>',
2522 _ => delim, // non-paired delimiters use the same character
2523 }
2524 }
2525
    /// Parse a substitution literal (`s/pat/repl/mods`) after the `s` has
    /// been consumed; `start` is the offset of the `s` itself.
    ///
    /// Paired delimiters (`{[(<`) nest and the replacement may use a
    /// DIFFERENT delimiter than the pattern (e.g. `s[foo]{bar}`, MUT_002
    /// fix). A budget guard bounds work on pathological patterns. Trailing
    /// alphanumerics are consumed as modifiers and validated by the parser
    /// (MUT_005 fix). Emits a single `Substitution` token covering the whole
    /// construct.
    fn parse_substitution(&mut self, start: usize) -> Option<Token> {
        // We've already consumed 's'
        let delimiter = self.current_char()?;
        self.advance(); // Skip delimiter

        // Parse pattern
        let mut depth = 1;
        let is_paired = matches!(delimiter, '{' | '[' | '(' | '<');
        let closing = Self::paired_closing(delimiter);

        while let Some(ch) = self.current_char() {
            // Check budget
            if let Some(token) = self.budget_guard(start, depth) {
                return Some(token);
            }

            match ch {
                '\\' => {
                    // Escape: skip backslash plus the escaped character
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                _ if ch == delimiter && is_paired => {
                    // Nested opener inside a paired pattern
                    depth += 1;
                    self.advance();
                }
                _ if ch == closing => {
                    self.advance();
                    if is_paired {
                        depth = depth.saturating_sub(1);
                        if depth == 0 {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                _ => self.advance(),
            }
        }

        // Parse replacement - may use different delimiter for paired patterns (e.g., s[foo]{bar})
        // MUT_002 fix: Detect the actual replacement delimiter instead of assuming same as pattern
        // Note: Pattern scanning is complete at this point; we use a separate repl_depth for replacement
        let (repl_delimiter, repl_closing, repl_is_paired) = if is_paired {
            // Skip whitespace between pattern and replacement for paired delimiters
            while let Some(ch) = self.current_char() {
                if ch.is_whitespace() {
                    self.advance();
                } else {
                    break;
                }
            }

            // Detect replacement delimiter - may be different from pattern delimiter
            if let Some(repl_delim) = self.current_char() {
                if matches!(repl_delim, '{' | '[' | '(' | '<') {
                    let repl_close = Self::paired_closing(repl_delim);
                    self.advance();
                    (repl_delim, repl_close, true)
                } else {
                    // Non-paired replacement after paired pattern (unusual but valid)
                    self.advance();
                    (repl_delim, repl_delim, false)
                }
            } else {
                // End of input - return what we have
                (delimiter, closing, is_paired)
            }
        } else {
            // Non-paired delimiter - replacement uses same delimiter
            (delimiter, closing, false)
        };

        // Use separate depth counter for replacement to avoid confusion with pattern depth
        let mut repl_depth: usize = 1;
        while let Some(ch) = self.current_char() {
            match ch {
                '\\' => {
                    self.advance();
                    if self.current_char().is_some() {
                        self.advance();
                    }
                }
                _ if ch == repl_delimiter && repl_is_paired => {
                    repl_depth += 1;
                    self.advance();
                }
                _ if ch == repl_closing => {
                    self.advance();
                    if repl_is_paired {
                        repl_depth = repl_depth.saturating_sub(1);
                        if repl_depth == 0 {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                _ => self.advance(),
            }
        }

        // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
        while let Some(ch) = self.current_char() {
            if ch.is_ascii_alphanumeric() {
                self.advance();
            } else {
                break;
            }
        }

        let text = &self.input[start..self.position];
        // The substitution completes a term; next token is an operator.
        self.mode = LexerMode::ExpectOperator;

        Some(Token {
            token_type: TokenType::Substitution,
            text: Arc::from(text),
            start,
            end: self.position,
        })
    }
2649
2650 fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
2651 // We've already consumed 'tr' or 'y'
2652 let delimiter = self.current_char()?;
2653 self.advance(); // Skip delimiter
2654
2655 // Parse search list
2656 let mut depth = 1;
2657 let is_paired = matches!(delimiter, '{' | '[' | '(' | '<');
2658 let closing = Self::paired_closing(delimiter);
2659
2660 while let Some(ch) = self.current_char() {
2661 // Check budget
2662 if let Some(token) = self.budget_guard(start, depth) {
2663 return Some(token);
2664 }
2665
2666 match ch {
2667 '\\' => {
2668 self.advance();
2669 if self.current_char().is_some() {
2670 self.advance();
2671 }
2672 }
2673 _ if ch == delimiter && is_paired => {
2674 depth += 1;
2675 self.advance();
2676 }
2677 _ if ch == closing => {
2678 self.advance();
2679 if is_paired {
2680 depth = depth.saturating_sub(1);
2681 if depth == 0 {
2682 break;
2683 }
2684 } else {
2685 break;
2686 }
2687 }
2688 _ => self.advance(),
2689 }
2690 }
2691
2692 // Parse replacement list - same delimiter handling
2693 if is_paired {
2694 // Skip whitespace between search and replace for paired delimiters
2695 while let Some(ch) = self.current_char() {
2696 if ch.is_whitespace() {
2697 self.advance();
2698 } else {
2699 break;
2700 }
2701 }
2702
2703 // Expect opening delimiter for replacement
2704 if self.current_char() == Some(delimiter) {
2705 self.advance();
2706 depth = 1;
2707 }
2708 }
2709
2710 while let Some(ch) = self.current_char() {
2711 match ch {
2712 '\\' => {
2713 self.advance();
2714 if self.current_char().is_some() {
2715 self.advance();
2716 }
2717 }
2718 _ if ch == delimiter && is_paired => {
2719 depth += 1;
2720 self.advance();
2721 }
2722 _ if ch == closing => {
2723 self.advance();
2724 if is_paired {
2725 depth = depth.saturating_sub(1);
2726 if depth == 0 {
2727 break;
2728 }
2729 } else {
2730 break;
2731 }
2732 }
2733 _ => self.advance(),
2734 }
2735 }
2736
2737 // Parse modifiers - include all alphanumeric for proper validation in parser (MUT_005 fix)
2738 while let Some(ch) = self.current_char() {
2739 if ch.is_ascii_alphanumeric() {
2740 self.advance();
2741 } else {
2742 break;
2743 }
2744 }
2745
2746 let text = &self.input[start..self.position];
2747 self.mode = LexerMode::ExpectOperator;
2748
2749 Some(Token {
2750 token_type: TokenType::Transliteration,
2751 text: Arc::from(text),
2752 start,
2753 end: self.position,
2754 })
2755 }
2756
2757 /// Read content between delimiters
2758 fn read_delimited_body(&mut self, delim: char) -> String {
2759 let paired = quote_handler::paired_close(delim);
2760 let close = paired.unwrap_or(delim);
2761 let mut body = String::new();
2762 let mut depth = i32::from(paired.is_some());
2763
2764 while let Some(ch) = self.current_char() {
2765 if ch == '\\' {
2766 body.push(ch);
2767 self.advance();
2768 if let Some(next) = self.current_char() {
2769 body.push(next);
2770 self.advance();
2771 }
2772 continue;
2773 }
2774
2775 if paired.is_some() && ch == delim {
2776 body.push(ch);
2777 self.advance();
2778 depth += 1;
2779 continue;
2780 }
2781
2782 if ch == close {
2783 if paired.is_some() {
2784 depth -= 1;
2785 if depth == 0 {
2786 self.advance();
2787 break;
2788 }
2789 body.push(ch);
2790 self.advance();
2791 } else {
2792 self.advance();
2793 break;
2794 }
2795 continue;
2796 }
2797
2798 body.push(ch);
2799 self.advance();
2800 }
2801
2802 body
2803 }
2804
2805 /// Parse a quote operator after we've seen the delimiter
2806 fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
2807 let info = self.current_quote_op.as_ref()?;
2808 let start = info.start_pos;
2809 let operator = info.operator.clone();
2810
2811 // Parse based on operator type
2812 match operator.as_str() {
2813 "s" => {
2814 // Substitution: two bodies
2815 let _pattern = self.read_delimited_body(delimiter);
2816
2817 // For paired delimiters, skip whitespace between bodies
2818 if quote_handler::paired_close(delimiter).is_some() {
2819 while let Some(ch) = self.current_char() {
2820 if ch.is_whitespace() {
2821 self.advance();
2822 } else {
2823 break;
2824 }
2825 }
2826 // Expect same delimiter for replacement
2827 if self.current_char() == Some(delimiter) {
2828 self.advance();
2829 }
2830 }
2831
2832 let _replacement = self.read_delimited_body(delimiter);
2833
2834 // Parse modifiers
2835 self.parse_regex_modifiers("e_handler::S_SPEC);
2836 }
2837 "tr" | "y" => {
2838 // Transliteration: two bodies
2839 let _from = self.read_delimited_body(delimiter);
2840
2841 // For paired delimiters, skip whitespace between bodies
2842 if quote_handler::paired_close(delimiter).is_some() {
2843 while let Some(ch) = self.current_char() {
2844 if ch.is_whitespace() {
2845 self.advance();
2846 } else {
2847 break;
2848 }
2849 }
2850 // Expect same delimiter for replacement
2851 if self.current_char() == Some(delimiter) {
2852 self.advance();
2853 }
2854 }
2855
2856 let _to = self.read_delimited_body(delimiter);
2857
2858 // Parse modifiers
2859 self.parse_regex_modifiers("e_handler::TR_SPEC);
2860 }
2861 "qr" => {
2862 let _pattern = self.read_delimited_body(delimiter);
2863 self.parse_regex_modifiers("e_handler::QR_SPEC);
2864 }
2865 "m" => {
2866 let _pattern = self.read_delimited_body(delimiter);
2867 self.parse_regex_modifiers("e_handler::M_SPEC);
2868 }
2869 _ => {
2870 // q, qq, qw, qx - no modifiers
2871 let _body = self.read_delimited_body(delimiter);
2872 }
2873 }
2874
2875 let text = &self.input[start..self.position];
2876 let token_type = quote_handler::get_quote_token_type(&operator);
2877
2878 self.mode = LexerMode::ExpectOperator;
2879 self.current_quote_op = None;
2880
2881 Some(Token { token_type, text: Arc::from(text), start, end: self.position })
2882 }
2883
2884 /// Parse regex modifiers according to the given spec
2885 ///
2886 /// This function includes ALL characters that could be intended as modifiers,
2887 /// including invalid ones. This allows the parser to properly reject invalid
2888 /// modifiers with a clear error message, rather than leaving them as separate
2889 /// tokens that could be confusingly parsed.
2890 fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
2891 // Consume all alphanumeric characters that could be intended as modifiers
2892 // The parser will validate and reject invalid ones
2893 while let Some(ch) = self.current_char() {
2894 if ch.is_ascii_alphanumeric() {
2895 self.advance();
2896 } else {
2897 break;
2898 }
2899 }
2900 // Note: We no longer validate here - the parser will validate and provide
2901 // clear error messages for invalid modifiers (MUT_005 fix)
2902 }
2903
2904 /// Parse a regex literal starting with `/`
2905 ///
2906 /// **Timeout Protection (Issue #422)**:
2907 /// - Budget guard prevents infinite loops on pathological input
2908 /// - MAX_REGEX_BYTES limit (64KB) ensures bounded execution time
2909 /// - Graceful degradation: emit UnknownRest token if budget exceeded
2910 ///
2911 /// **Performance**:
2912 /// - Single-pass scanning with escape handling
2913 /// - Budget check per iteration (amortized O(1) via inline fast path)
2914 /// - Typical regex: <10μs, Large regex (64KB): ~1ms
2915 fn parse_regex(&mut self, start: usize) -> Option<Token> {
2916 self.advance(); // Skip opening /
2917
2918 while let Some(ch) = self.current_char() {
2919 // Budget guard: prevent timeout on pathological input (Issue #422)
2920 // If exceeded, returns UnknownRest token for graceful degradation
2921 if let Some(token) = self.budget_guard(start, 0) {
2922 return Some(token);
2923 }
2924
2925 match ch {
2926 '/' => {
2927 self.advance();
2928 // Parse flags - include all alphanumeric for proper validation in parser (MUT_005 fix)
2929 while let Some(ch) = self.current_char() {
2930 if ch.is_ascii_alphanumeric() {
2931 self.advance();
2932 } else {
2933 break;
2934 }
2935 }
2936
2937 let text = &self.input[start..self.position];
2938 self.mode = LexerMode::ExpectOperator;
2939
2940 return Some(Token {
2941 token_type: TokenType::RegexMatch,
2942 text: Arc::from(text),
2943 start,
2944 end: self.position,
2945 });
2946 }
2947 '\\' => {
2948 // Handle escape sequences: consume backslash + next char
2949 self.advance();
2950 if self.current_char().is_some() {
2951 self.advance();
2952 }
2953 }
2954 _ => self.advance(),
2955 }
2956 }
2957
2958 // Unterminated regex - EOF reached before closing /
2959 // Parser will emit diagnostic for unterminated literal
2960 None
2961 }
2962}
2963
// Process-wide empty Arc<str>, created once; callers get cheap refcount clones.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();

/// Return the shared empty `Arc<str>` without allocating a new buffer.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
2971
2972#[inline(always)]
2973fn is_keyword(word: &str) -> bool {
2974 // Fast length-based rejection for most cases.
2975 // Lexer keywords are currently bounded to 1..=9 characters.
2976 matches!(word.len(), 1..=9) && is_lexer_keyword(word)
2977}
2978
2979/// Fast lookup table for compound operator second characters
2980const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*";
2981
2982#[inline]
2983fn is_compound_operator(first: char, second: char) -> bool {
2984 // Optimized compound operator lookup using perfect hashing for common cases
2985 // Convert to bytes for faster comparison (most operators are ASCII)
2986 if first.is_ascii() && second.is_ascii() {
2987 let first_byte = first as u8;
2988 let second_byte = second as u8;
2989
2990 if !COMPOUND_SECOND_CHARS.contains(&second_byte) {
2991 return false;
2992 }
2993
2994 // Use lookup table approach for maximum performance
2995 match (first_byte, second_byte) {
2996 // Assignment operators
2997 (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=') => true,
2998
2999 // Comparison operators
3000 (b'<' | b'>' | b'=' | b'!', b'=') => true,
3001
3002 // Pattern operators
3003 (b'=' | b'!', b'~') => true,
3004
3005 // Increment/decrement
3006 (b'+', b'+') | (b'-', b'-') => true,
3007
3008 // Logical operators
3009 (b'&', b'&') | (b'|', b'|') => true,
3010
3011 // Shift operators
3012 (b'<', b'<') | (b'>', b'>') => true,
3013
3014 // Other compound operators
3015 (b'*', b'*')
3016 | (b'/', b'/')
3017 | (b'-' | b'=', b'>')
3018 | (b'.', b'.')
3019 | (b'~', b'~')
3020 | (b':', b':') => true,
3021
3022 _ => false,
3023 }
3024 } else {
3025 // Fallback for non-ASCII (should be rare)
3026 matches!(
3027 (first, second),
3028 ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
3029 | ('=' | '!' | '~', '~')
3030 | ('+', '+')
3031 | ('-', '-' | '>')
3032 | ('&', '&')
3033 | ('|', '|')
3034 | ('<', '<')
3035 | ('>' | '=', '>')
3036 | ('*', '*')
3037 | ('/', '/')
3038 | ('.', '.')
3039 | (':', ':')
3040 )
3041 }
3042}
3043
// Checkpoint support for incremental parsing
impl Checkpointable for PerlLexer<'_> {
    /// Capture the lexer's resumable state: byte position, mode, delimiter
    /// stack, prototype tracking, and line/column position, plus a context
    /// tag describing any in-progress construct (format body or quote-like).
    fn checkpoint(&self) -> LexerCheckpoint {
        use checkpoint::CheckpointContext;

        // Determine the checkpoint context based on current state
        let context = if matches!(self.mode, LexerMode::InFormatBody) {
            CheckpointContext::Format {
                // NOTE(review): start_position is only an approximation
                // (current position minus 100); the true format start is not
                // tracked here — confirm downstream consumers tolerate this.
                start_position: self.position.saturating_sub(100), // Approximate
            }
        } else if !self.delimiter_stack.is_empty() {
            // We're in some kind of quote-like construct
            CheckpointContext::QuoteLike {
                operator: String::new(), // Would need to track this
                delimiter: self.delimiter_stack.last().copied().unwrap_or('\0'),
                // NOTE(review): is_paired is hard-coded true regardless of
                // the actual delimiter — verify this is safe for restores.
                is_paired: true,
            }
        } else {
            CheckpointContext::Normal
        };

        LexerCheckpoint {
            position: self.position,
            mode: self.mode,
            delimiter_stack: self.delimiter_stack.clone(),
            in_prototype: self.in_prototype,
            prototype_depth: self.prototype_depth,
            current_pos: self.current_pos,
            context,
        }
    }

    /// Restore lexer state from a checkpoint, overwriting position, mode,
    /// delimiter stack, prototype tracking, and line/column position.
    fn restore(&mut self, checkpoint: &LexerCheckpoint) {
        self.position = checkpoint.position;
        self.mode = checkpoint.mode;
        // clone_from reuses the existing stack's allocation where possible.
        self.delimiter_stack.clone_from(&checkpoint.delimiter_stack);
        self.in_prototype = checkpoint.in_prototype;
        self.prototype_depth = checkpoint.prototype_depth;
        self.current_pos = checkpoint.current_pos;

        // Handle special contexts
        use checkpoint::CheckpointContext;
        if let CheckpointContext::Format { .. } = &checkpoint.context {
            // Ensure we're in format body mode
            if !matches!(self.mode, LexerMode::InFormatBody) {
                self.mode = LexerMode::InFormatBody;
            }
        }
    }

    /// A checkpoint is restorable as long as its position fits this input;
    /// no deeper consistency checks are performed.
    fn can_restore(&self, checkpoint: &LexerCheckpoint) -> bool {
        // Can restore if the position is valid for our input
        checkpoint.position <= self.input.len()
    }
}
3099
3100#[cfg(test)]
3101mod test_format_debug;
3102
#[cfg(test)]
mod tests {
    use super::*;

    // Boxed-error alias so tests can use `?` on Option-via-ok_or lookups.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;

    /// Smoke test: `my $x = 42;` lexes into keyword, identifier, operator,
    /// number, and semicolon in order.
    #[test]
    fn test_basic_tokens() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 42;");

        let token = lexer.next_token().ok_or("Expected keyword token")?;
        assert_eq!(token.token_type, TokenType::Keyword(Arc::from("my")));

        let token = lexer.next_token().ok_or("Expected identifier token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(_)));

        let token = lexer.next_token().ok_or("Expected operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(_)));

        let token = lexer.next_token().ok_or("Expected number token")?;
        assert!(matches!(token.token_type, TokenType::Number(_)));

        let token = lexer.next_token().ok_or("Expected semicolon token")?;
        assert_eq!(token.token_type, TokenType::Semicolon);
        Ok(())
    }

    /// `/` after a term is division; `/` where a term is expected is a regex.
    #[test]
    fn test_slash_disambiguation() -> TestResult {
        // Division
        let mut lexer = PerlLexer::new("10 / 2");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);

        // Regex
        let mut lexer = PerlLexer::new("if (/pattern/)");
        lexer.next_token(); // if
        lexer.next_token(); // (
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }

    /// `%` is a hash sigil where a term is expected, modulo after a term.
    #[test]
    fn test_percent_and_double_sigil_disambiguation() -> TestResult {
        // Hash variable
        let mut lexer = PerlLexer::new("%hash");
        let token = lexer.next_token().ok_or("Expected hash identifier token")?;
        assert!(
            matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "%hash")
        );

        // Modulo operator
        let mut lexer = PerlLexer::new("10 % 3");
        lexer.next_token(); // 10
        let token = lexer.next_token().ok_or("Expected modulo operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "%"));
        Ok(())
    }

    /// `//` after a term is defined-or; after `=~` it is an (empty) regex.
    /// Also checks the `**` exponent compound operator.
    #[test]
    fn test_defined_or_and_exponent() -> TestResult {
        // Defined-or operator
        let mut lexer = PerlLexer::new("$a // $b");
        lexer.next_token(); // $a
        let token = lexer.next_token().ok_or("Expected defined-or operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "//"));

        // Regex after =~ should still parse
        let mut lexer = PerlLexer::new("$x =~ //");
        lexer.next_token(); // $x
        lexer.next_token(); // =~
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);

        // Exponent operator
        let mut lexer = PerlLexer::new("2 ** 3");
        lexer.next_token(); // 2
        let token = lexer.next_token().ok_or("Expected exponent operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "**"));
        Ok(())
    }
}