#![warn(clippy::all)]
#![allow(
// Core allows for lexer code
clippy::too_many_lines,
clippy::module_name_repetitions,
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
clippy::cast_possible_wrap,
clippy::cast_precision_loss,
clippy::must_use_candidate,
clippy::missing_errors_doc,
clippy::missing_panics_doc,
// Lexer-specific patterns that are fine
clippy::match_same_arms,
clippy::redundant_else,
clippy::unnecessary_wraps,
clippy::unused_self,
clippy::items_after_statements,
clippy::struct_excessive_bools,
clippy::uninlined_format_args
)]
use perl_keywords::is_lexer_keyword;
use std::sync::{Arc, OnceLock};
pub mod checkpoint;
pub mod error;
pub mod mode;
mod quote_handler;
pub mod token;
mod unicode;
pub use checkpoint::{CheckpointCache, Checkpointable, LexerCheckpoint};
pub use error::{LexerError, Result};
pub use mode::LexerMode;
pub use perl_position_tracking::Position;
pub use token::{StringPart, Token, TokenType};
use unicode::{is_perl_identifier_continue, is_perl_identifier_start};
/// One pending heredoc announced by `<<LABEL` whose body has not yet been
/// consumed by `next_token`.
#[derive(Clone)]
struct HeredocSpec {
    /// Terminator label, stripped of any quoting.
    label: Arc<str>,
    /// Byte offset where the body begins; `0` means "not yet known"
    /// (it is filled in when the announcing line's newline is consumed).
    body_start: usize,
    /// `true` for `<<~`, which permits an indented terminator line.
    allow_indent: bool,
}
/// Hard ceiling on the byte length of a single regex literal.
const MAX_REGEX_BYTES: usize = 64 * 1024;
/// Hard ceiling on the byte length of one heredoc body.
const MAX_HEREDOC_BYTES: usize = 256 * 1024;
/// Maximum nesting depth of quote-like delimiters before bailing out.
const MAX_DELIM_NEST: usize = 128;
/// Maximum number of simultaneously pending heredocs.
const MAX_HEREDOC_DEPTH: usize = 100;
/// Wall-clock budget for scanning a heredoc body, in milliseconds.
const HEREDOC_TIMEOUT_MS: u64 = 5000;
/// Upper bound on parse steps while scanning a regex.
pub const MAX_REGEX_PARSE_STEPS: usize = 32 * 1024;
/// Tunable knobs for [`PerlLexer`].
#[derive(Debug, Clone)]
pub struct LexerConfig {
    // NOTE(review): `parse_interpolation` and `track_positions` are not
    // referenced in this part of the file — presumably consulted by the
    // string/position code; confirm before relying on them.
    pub parse_interpolation: bool,
    pub track_positions: bool,
    /// Maximum distance, in bytes, that the peek helpers may look ahead.
    pub max_lookahead: usize,
}
impl Default for LexerConfig {
    /// Interpolation and position tracking on, 1 KiB of lookahead.
    fn default() -> Self {
        Self {
            parse_interpolation: true,
            track_positions: true,
            max_lookahead: 1024,
        }
    }
}
/// Streaming tokenizer for Perl source: `next_token` yields `Token`s on
/// demand while maintaining the mode and heuristic context Perl lexing
/// requires (term vs. operator position, pending heredocs, etc.).
pub struct PerlLexer<'a> {
    // Source text and its byte view; `position` indexes both.
    input: &'a str,
    input_bytes: &'a [u8],
    // Current byte offset into `input`.
    position: usize,
    // Expectation-based lexing mode (term / operator / delimiter / ...).
    mode: LexerMode,
    config: LexerConfig,
    // Open quote-like delimiters, innermost last.
    delimiter_stack: Vec<char>,
    // True while lexing a sub prototype; disables punctuation-variable rules.
    in_prototype: bool,
    prototype_depth: usize,
    // Heuristic context flags used to disambiguate overloaded syntax
    // (e.g. `s`/`y` after `->` are method names, not operators).
    after_sub: bool,
    after_arrow: bool,
    hash_brace_depth: usize,
    after_var_subscript: bool,
    paren_depth: usize,
    #[allow(dead_code)]
    current_pos: Position,
    // True when `position` sits at the start of a line; gates heredoc
    // terminators and __DATA__/__END__ markers.
    after_newline: bool,
    // Heredocs announced but whose bodies have not been consumed yet.
    pending_heredocs: Vec<HeredocSpec>,
    // Byte offset of the start of the current line.
    line_start_offset: usize,
    // When set, heredoc bodies are emitted as `HeredocBody` tokens.
    emit_heredoc_body_tokens: bool,
    // Quote operator (q/qq/m/...) awaiting its delimiter, if any.
    current_quote_op: Option<quote_handler::QuoteOperatorInfo>,
    // Ensures exactly one EOF token is emitted.
    eof_emitted: bool,
    // Wall-clock start, used by the heredoc scanning timeout.
    start_time: std::time::Instant,
}
impl<'a> PerlLexer<'a> {
/// Builds a lexer over `input` using [`LexerConfig::default`].
pub fn new(input: &'a str) -> Self {
    Self::with_config(input, LexerConfig::default())
}
/// Builds a lexer over `input` with an explicit configuration.
/// A fresh lexer expects a term and is positioned at a line start.
pub fn with_config(input: &'a str, config: LexerConfig) -> Self {
    Self {
        // Source views and cursor.
        input,
        input_bytes: input.as_bytes(),
        position: 0,
        line_start_offset: 0,
        current_pos: Position::start(),
        // Initial mode: start of file behaves like start of a line.
        mode: LexerMode::ExpectTerm,
        after_newline: true,
        config,
        // Disambiguation state.
        delimiter_stack: Vec::new(),
        in_prototype: false,
        prototype_depth: 0,
        after_sub: false,
        after_arrow: false,
        hash_brace_depth: 0,
        after_var_subscript: false,
        paren_depth: 0,
        // Heredoc / quote-operator machinery.
        pending_heredocs: Vec::new(),
        emit_heredoc_body_tokens: false,
        current_quote_op: None,
        eof_emitted: false,
        start_time: std::time::Instant::now(),
    }
}
pub fn with_body_tokens(input: &'a str) -> Self {
let mut lexer = Self::new(input);
lexer.emit_heredoc_body_tokens = true;
lexer
}
/// Skips a UTF-8 byte-order mark at the very start of the input, if any.
fn normalize_file_start(&mut self) {
    const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    if self.position == 0 && self.matches_bytes(&BOM) {
        self.position = BOM.len();
        self.line_start_offset = BOM.len();
    }
}
/// Overrides the current lexer mode (e.g. to force term or operator
/// expectation from an external driver).
pub fn set_mode(&mut self, mode: LexerMode) {
    self.mode = mode;
}
/// Returns `true` when, starting at `p`, only spaces/tabs occur before the
/// next line break (or the end of input).
#[inline]
fn trailing_ws_only(bytes: &[u8], mut p: usize) -> bool {
    while let Some(&b) = bytes.get(p) {
        match b {
            b'\n' | b'\r' => return true,
            b' ' | b'\t' => p += 1,
            _ => return false,
        }
    }
    // Ran off the end of the input without seeing a non-whitespace byte.
    true
}
/// Consumes one line ending (`\n`, `\r`, or `\r\n`) at the cursor, if
/// present, and updates the line-start bookkeeping. Does nothing when the
/// cursor is not on a line-ending byte.
#[inline]
fn consume_newline(&mut self) {
    let Some(&b) = self.input_bytes.get(self.position) else {
        return;
    };
    match b {
        b'\n' => self.advance(),
        b'\r' => {
            // `\r\n` counts as a single line ending.
            self.position += 1;
            if self.input_bytes.get(self.position) == Some(&b'\n') {
                self.position += 1;
            }
        }
        _ => return,
    }
    self.after_newline = true;
    self.line_start_offset = self.position;
}
/// Scans from `start` to the first `\r` or `\n` (or end of input) and
/// returns `(scan_end, visible_end)` — the two are always equal.
///
/// Because the scan stops *at* the terminator byte, every byte in
/// `start..end` is part of the visible line; the previous CR-trimming
/// branch (`bytes[end - 1] == b'\r'`) was unreachable, since the loop can
/// never step past a `\r`. The two-field return is kept for callers.
#[inline]
fn find_line_end(bytes: &[u8], start: usize) -> (usize, usize) {
    let mut end = start;
    while end < bytes.len() && bytes[end] != b'\n' && bytes[end] != b'\r' {
        end += 1;
    }
    (end, end)
}
/// Produces the next token, or `None` once the single EOF token has been
/// returned.
///
/// Handles, in order: format/data-section sub-lexers, consumption of
/// pending heredoc bodies, whitespace/comment/POD skipping, EOF, and the
/// per-kind `try_*` dispatch chain. Unrecognized characters yield a
/// one-char `Error` token, so lexing always makes forward progress.
pub fn next_token(&mut self) -> Option<Token> {
    if self.position == 0 {
        // Skip a UTF-8 BOM before the first token.
        self.normalize_file_start();
    }
    loop {
        // Dedicated sub-lexers own the stream inside `format` bodies and
        // after a __DATA__/__END__ marker.
        if matches!(self.mode, LexerMode::InFormatBody) {
            return self.parse_format_body();
        }
        if matches!(self.mode, LexerMode::InDataSection) {
            return self.parse_data_body();
        }
        let mut found_terminator = false;
        if !self.pending_heredocs.is_empty() {
            // A heredoc body is due when the front spec knows where its
            // body starts (body_start > 0) and the cursor has reached it.
            let (body_start, label, allow_indent) =
                if let Some(spec) = self.pending_heredocs.first() {
                    if spec.body_start > 0
                        && self.position >= spec.body_start
                        && self.position < self.input.len()
                    {
                        (spec.body_start, spec.label.clone(), spec.allow_indent)
                    } else {
                        (0, empty_arc(), false)
                    }
                } else {
                    (0, empty_arc(), false)
                };
            if body_start > 0 {
                // Scan the body line-by-line looking for the terminator.
                while self.position < self.input.len() {
                    // Wall-clock guard against pathological inputs.
                    if self.start_time.elapsed().as_millis() > HEREDOC_TIMEOUT_MS as u128 {
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::Error(Arc::from("Heredoc parsing timeout")),
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                    // Byte budget: oversized bodies become UnknownRest.
                    if self.position - body_start > MAX_HEREDOC_BYTES {
                        self.pending_heredocs.remove(0);
                        self.position = self.input.len();
                        return Some(Token {
                            token_type: TokenType::UnknownRest,
                            text: Arc::from(&self.input[body_start..]),
                            start: body_start,
                            end: self.input.len(),
                        });
                    }
                    // Resynchronize to a line boundary if we are mid-line.
                    if !self.after_newline && self.position != body_start {
                        while self.position < self.input.len()
                            && self.input_bytes[self.position] != b'\n'
                            && self.input_bytes[self.position] != b'\r'
                        {
                            self.advance();
                        }
                        self.consume_newline();
                        continue;
                    }
                    let line_start = self.position;
                    let (line_end, line_visible_end) =
                        Self::find_line_end(self.input_bytes, self.position);
                    let line = &self.input[line_start..line_visible_end];
                    let trimmed_end = line.trim_end_matches([' ', '\t']);
                    // `<<~` permits leading indentation on the terminator.
                    let is_terminator = if allow_indent {
                        let mut p = 0;
                        while p < trimmed_end.len() {
                            let b = trimmed_end.as_bytes()[p];
                            if b == b' ' || b == b'\t' {
                                p += 1;
                            } else {
                                break;
                            }
                        }
                        trimmed_end[p..] == *label
                    } else {
                        trimmed_end == &*label
                    };
                    if is_terminator {
                        self.pending_heredocs.remove(0);
                        found_terminator = true;
                        self.position = line_end;
                        self.consume_newline();
                        // The next queued heredoc's body begins right here.
                        if let Some(next) = self.pending_heredocs.first_mut()
                            && next.body_start == 0
                        {
                            next.body_start = self.position;
                        }
                        if self.emit_heredoc_body_tokens {
                            return Some(Token {
                                token_type: TokenType::HeredocBody(empty_arc()),
                                text: empty_arc(),
                                start: body_start,
                                end: line_start,
                            });
                        }
                        break;
                    }
                    self.position = line_end;
                    self.consume_newline();
                }
                // Unterminated heredoc: swallow the rest of the input.
                if !found_terminator {
                    self.pending_heredocs.remove(0);
                    self.position = self.input.len();
                    return Some(Token {
                        token_type: TokenType::UnknownRest,
                        text: Arc::from(&self.input[body_start..]),
                        start: body_start,
                        end: self.input.len(),
                    });
                }
            }
            if found_terminator {
                // Re-enter the loop: more heredocs may be pending.
                continue;
            }
        }
        self.skip_whitespace_and_comments()?;
        // Skipping whitespace may have landed on a pending heredoc body.
        if !self.pending_heredocs.is_empty()
            && let Some(spec) = self.pending_heredocs.first()
            && spec.body_start > 0
            && self.position >= spec.body_start
            && self.position < self.input.len()
        {
            continue;
        }
        // Heredocs never satisfied before EOF are dropped.
        if self.position >= self.input.len() && !self.pending_heredocs.is_empty() {
            self.pending_heredocs.clear();
        }
        if self.position >= self.input.len() {
            // Emit exactly one EOF token; afterwards return None forever.
            if self.eof_emitted {
                return None;
            }
            self.eof_emitted = true;
            return Some(Token {
                token_type: TokenType::EOF,
                text: empty_arc(),
                start: self.position,
                end: self.position,
            });
        }
        let start = self.position;
        // Dispatch order matters: heredocs must win over `<` operators,
        // strings over identifiers (q/qq/...), variables over operators.
        if let Some(token) = self.try_heredoc() {
            return Some(token);
        }
        if let Some(token) = self.try_string() {
            return Some(token);
        }
        if let Some(token) = self.try_variable() {
            return Some(token);
        }
        if let Some(token) = self.try_number() {
            return Some(token);
        }
        if let Some(token) = self.try_vstring() {
            return Some(token);
        }
        if let Some(token) = self.try_identifier_or_keyword() {
            return Some(token);
        }
        // A quote operator announced a delimiter: try to read it,
        // otherwise fall back to operator mode and retry the loop.
        if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
            if let Some(token) = self.try_delimiter() {
                return Some(token);
            }
            self.mode = LexerMode::ExpectOperator;
            self.current_quote_op = None;
            continue;
        }
        if let Some(token) = self.try_operator() {
            return Some(token);
        }
        if let Some(token) = self.try_delimiter() {
            return Some(token);
        }
        // Nothing matched: emit a one-char Error token.
        let ch = self.current_char()?;
        self.advance();
        let text = if ch.is_ascii() {
            Arc::from(&self.input[start..self.position])
        } else {
            Arc::from(ch.to_string())
        };
        return Some(Token {
            token_type: TokenType::Error(Arc::from("Unexpected character")),
            text,
            start,
            end: self.position,
        });
    }
}
/// Aborts scanning when a quote/regex construct exceeds its byte or
/// nesting budget: returns an `UnknownRest` token covering the remainder
/// of the input, or `None` while the budget still holds.
#[allow(clippy::inline_always)]
#[inline(always)]
fn budget_guard(&mut self, start: usize, depth: usize) -> Option<Token> {
    let bytes_consumed = self.position - start;
    let exceeded = bytes_consumed > MAX_REGEX_BYTES || depth > MAX_DELIM_NEST;
    if !exceeded {
        return None;
    }
    #[cfg(debug_assertions)]
    {
        tracing::debug!(
            bytes_consumed,
            depth,
            position = self.position,
            "Lexer budget exceeded"
        );
    }
    // Give up on the rest of the input: one opaque token to the end.
    self.position = self.input.len();
    Some(Token {
        token_type: TokenType::UnknownRest,
        text: Arc::from(""),
        start,
        end: self.position,
    })
}
/// Lexes one token ahead without consuming it: snapshots every piece of
/// mutable cursor/mode state, runs `next_token`, then restores the
/// snapshot.
///
/// NOTE(review): `emit_heredoc_body_tokens` is the only field not saved;
/// it looks like configuration rather than cursor state — confirm that
/// `next_token` (and its helpers outside this chunk) never mutate it.
pub fn peek_token(&mut self) -> Option<Token> {
    // Snapshot.
    let saved_pos = self.position;
    let saved_mode = self.mode;
    let saved_delimiter_stack = self.delimiter_stack.clone();
    let saved_prototype = self.in_prototype;
    let saved_depth = self.prototype_depth;
    let saved_after_sub = self.after_sub;
    let saved_after_arrow = self.after_arrow;
    let saved_hash_brace_depth = self.hash_brace_depth;
    let saved_after_var_subscript = self.after_var_subscript;
    let saved_paren_depth = self.paren_depth;
    let saved_current_pos = self.current_pos;
    let saved_after_newline = self.after_newline;
    let saved_pending_heredocs = self.pending_heredocs.clone();
    let saved_line_start_offset = self.line_start_offset;
    let saved_current_quote_op = self.current_quote_op.clone();
    let saved_eof_emitted = self.eof_emitted;
    let saved_start_time = self.start_time;
    // Lex one token, then roll everything back.
    let token = self.next_token();
    self.position = saved_pos;
    self.mode = saved_mode;
    self.delimiter_stack = saved_delimiter_stack;
    self.in_prototype = saved_prototype;
    self.prototype_depth = saved_depth;
    self.after_sub = saved_after_sub;
    self.after_arrow = saved_after_arrow;
    self.hash_brace_depth = saved_hash_brace_depth;
    self.after_var_subscript = saved_after_var_subscript;
    self.paren_depth = saved_paren_depth;
    self.current_pos = saved_current_pos;
    self.after_newline = saved_after_newline;
    self.pending_heredocs = saved_pending_heredocs;
    self.line_start_offset = saved_line_start_offset;
    self.current_quote_op = saved_current_quote_op;
    self.eof_emitted = saved_eof_emitted;
    self.start_time = saved_start_time;
    token
}
/// Drains the lexer into a vector, stopping after (and including) the
/// EOF token.
pub fn collect_tokens(&mut self) -> Vec<Token> {
    let mut tokens = Vec::new();
    loop {
        match self.next_token() {
            Some(token) => {
                let is_eof = token.token_type == TokenType::EOF;
                tokens.push(token);
                if is_eof {
                    break;
                }
            }
            None => break,
        }
    }
    tokens
}
/// Rewinds the lexer to the start of its input, restoring the same state
/// `with_config` establishes. The input, configuration, and the
/// `emit_heredoc_body_tokens` flag are kept.
pub fn reset(&mut self) {
    // Cursor and line bookkeeping.
    self.position = 0;
    self.line_start_offset = 0;
    self.current_pos = Position::start();
    self.after_newline = true;
    // Mode and disambiguation state.
    self.mode = LexerMode::ExpectTerm;
    self.delimiter_stack.clear();
    self.in_prototype = false;
    self.prototype_depth = 0;
    self.after_sub = false;
    self.after_arrow = false;
    self.hash_brace_depth = 0;
    self.after_var_subscript = false;
    self.paren_depth = 0;
    // Heredoc / quote-operator machinery.
    self.pending_heredocs.clear();
    self.current_quote_op = None;
    self.eof_emitted = false;
    self.start_time = std::time::Instant::now();
}
/// Switches the lexer into format-body mode; subsequent `next_token`
/// calls delegate to `parse_format_body` until that mode ends.
pub fn enter_format_mode(&mut self) {
    self.mode = LexerMode::InFormatBody;
}
/// Returns the byte at `index`, or `0` past the end. Out-of-range access
/// is a caller bug, asserted in debug builds; release builds degrade to
/// the `0` sentinel instead of panicking.
#[allow(clippy::inline_always)]
#[inline(always)]
fn byte_at(bytes: &[u8], index: usize) -> u8 {
    debug_assert!(index < bytes.len());
    bytes.get(index).copied().unwrap_or(0)
}
#[allow(clippy::inline_always)] #[inline(always)]
fn current_char(&self) -> Option<char> {
if self.position < self.input_bytes.len() {
let byte = Self::byte_at(self.input_bytes, self.position);
if byte < 128 {
Some(byte as char)
} else {
self.input.get(self.position..).and_then(|s| s.chars().next())
}
} else {
None
}
}
/// Looks `offset` BYTES ahead of the cursor and decodes the char there;
/// `None` past EOF or beyond `max_lookahead`.
#[inline(always)]
fn peek_char(&self, offset: usize) -> Option<char> {
    if offset > self.config.max_lookahead {
        return None;
    }
    let pos = self.position.checked_add(offset)?;
    if pos < self.input_bytes.len() {
        let byte = Self::byte_at(self.input_bytes, pos);
        if byte < 128 {
            Some(byte as char)
        } else {
            // NOTE(review): this fallback counts CHARS from `position`
            // while `offset` is a BYTE distance — the two only agree when
            // the skipped prefix is pure ASCII. Confirm callers never peek
            // across a multi-byte character.
            self.input.get(self.position..).and_then(|s| s.chars().nth(offset))
        }
    } else {
        None
    }
}
#[allow(clippy::inline_always)] #[inline(always)]
fn advance(&mut self) {
if self.position < self.input_bytes.len() {
let byte = Self::byte_at(self.input_bytes, self.position);
if byte < 128 {
self.position += 1;
} else if let Some(ch) = self.input.get(self.position..).and_then(|s| s.chars().next())
{
self.position += ch.len_utf8();
}
}
}
/// Returns the raw byte `offset` bytes ahead of the cursor; `None` past
/// EOF or beyond `max_lookahead`.
#[inline]
fn peek_byte(&self, offset: usize) -> Option<u8> {
    if offset > self.config.max_lookahead {
        return None;
    }
    let pos = self.position.checked_add(offset)?;
    self.input_bytes.get(pos).copied()
}
/// Returns `true` when the bytes at the cursor start with `pattern`.
/// The empty pattern always matches; patterns longer than the lookahead
/// budget never do.
#[inline]
fn matches_bytes(&self, pattern: &[u8]) -> bool {
    let Some(last_offset) = pattern.len().checked_sub(1) else {
        // Empty pattern.
        return true;
    };
    if last_offset > self.config.max_lookahead {
        return false;
    }
    match self.position.checked_add(pattern.len()) {
        Some(end) => self.input_bytes.get(self.position..end) == Some(pattern),
        None => false,
    }
}
/// Skips whitespace, `#` comments, and POD blocks; records heredoc body
/// start offsets when a newline is crossed. Always returns `Some(())` —
/// the `Option` return exists so callers can use `?`.
#[inline]
fn skip_whitespace_and_comments(&mut self) -> Option<()> {
    // Any progress past the current line start means we are mid-line.
    if self.position > 0 && self.position != self.line_start_offset {
        self.after_newline = false;
    }
    while self.position < self.input_bytes.len() {
        let byte = Self::byte_at(self.input_bytes, self.position);
        match byte {
            b' ' => {
                // Consume a run of spaces.
                let start = self.position;
                while self.position < self.input_bytes.len()
                    && Self::byte_at(self.input_bytes, self.position) == b' '
                {
                    self.position += 1;
                }
                if self.position > start {
                    // no-op: the run length is currently unused
                }
            }
            b'\t' => {
                // Consume a run of tabs.
                let start = self.position;
                while self.position < self.input_bytes.len()
                    && Self::byte_at(self.input_bytes, self.position) == b'\t'
                {
                    self.position += 1;
                }
                if self.position > start {
                    // no-op: the run length is currently unused
                }
            }
            b'\r' | b'\n' => {
                self.consume_newline();
                // Crossing a newline fixes where the first pending
                // heredoc body with an unknown start begins.
                if !self.pending_heredocs.is_empty() {
                    for spec in &mut self.pending_heredocs {
                        if spec.body_start == 0 {
                            spec.body_start = self.position;
                            break;
                        }
                    }
                }
            }
            b'#' => {
                // `#` may itself be a quote delimiter (e.g. `m#...#`).
                if matches!(self.mode, LexerMode::ExpectDelimiter) {
                    break;
                }
                // Otherwise skip the comment up to (not past) the newline.
                self.position += 1;
                if let Some(newline_offset) =
                    memchr::memchr(b'\n', &self.input_bytes[self.position..])
                {
                    self.position += newline_offset;
                } else {
                    self.position = self.input_bytes.len();
                }
            }
            // A line-initial `=` may open a POD block.
            b'=' if self.position == 0
                || (self.position > 0 && self.input_bytes[self.position - 1] == b'\n') =>
            {
                let remaining = &self.input_bytes[self.position..];
                if remaining.starts_with(b"=pod")
                    || remaining.starts_with(b"=head")
                    || remaining.starts_with(b"=over")
                    || remaining.starts_with(b"=item")
                    || remaining.starts_with(b"=back")
                    || remaining.starts_with(b"=begin")
                    || remaining.starts_with(b"=end")
                    || remaining.starts_with(b"=for")
                    || remaining.starts_with(b"=encoding")
                {
                    // Skip everything through the line containing a
                    // line-initial `=cut` (inclusive of its newline).
                    let search_start = self.position;
                    let mut found_cut = false;
                    let bytes = self.input_bytes;
                    let mut i = search_start;
                    while i < bytes.len() {
                        if (i == 0 || bytes[i - 1] == b'\n') && bytes[i..].starts_with(b"=cut")
                        {
                            i += 4;
                            while i < bytes.len() && bytes[i] != b'\n' {
                                i += 1;
                            }
                            if i < bytes.len() && bytes[i] == b'\n' {
                                i += 1;
                            }
                            self.position = i;
                            found_cut = true;
                            break;
                        }
                        i += 1;
                    }
                    // Unterminated POD swallows the rest of the file.
                    if !found_cut {
                        self.position = bytes.len();
                    }
                    continue;
                }
                break;
            }
            _ => {
                // Skip non-ASCII Unicode whitespace; stop at anything else.
                if byte >= 128
                    && let Some(ch) = self.current_char()
                    && ch.is_whitespace()
                {
                    self.advance();
                    continue;
                }
                break;
            }
        }
    }
    Some(())
}
/// Attempts to lex a heredoc announcement (`<<EOF`, `<<~EOF`, `<<"EOF"`,
/// `<<'EOF'`, `` <<`EOF` ``, `<<\EOF`). On success emits a
/// `HeredocStart` token and queues a `HeredocSpec` so `next_token` can
/// consume the body once the announcing line ends.
///
/// Returns `None` (with the cursor restored) when `<<` is not followed by
/// a valid label — e.g. when it is the left-shift operator. The three
/// previously duplicated quoted-label arms (`"`, `'`, `` ` ``) are merged
/// into one; behavior is unchanged.
fn try_heredoc(&mut self) -> Option<Token> {
    // Inside parens while expecting an operator, `<<` is a shift.
    if self.mode == LexerMode::ExpectOperator && self.paren_depth > 0 {
        return None;
    }
    if self.peek_byte(0) != Some(b'<') || self.peek_byte(1) != Some(b'<') {
        return None;
    }
    let start = self.position;
    let mut text = String::from("<<");
    self.position += 2;
    // `<<~` allows the terminator line to be indented.
    let allow_indent = if self.current_char() == Some('~') {
        text.push('~');
        self.advance();
        true
    } else {
        false
    };
    // Optional spaces/tabs between `<<` and the label.
    while let Some(ch) = self.current_char() {
        if ch == ' ' || ch == '\t' {
            text.push(ch);
            self.advance();
        } else {
            break;
        }
    }
    // `<<\EOF` forces a bare (single-quote-like) label.
    let backslashed = if self.current_char() == Some('\\') {
        text.push('\\');
        self.advance();
        true
    } else {
        false
    };
    // The label is either quoted (any of the three quote chars) or a
    // bare identifier; anything else means this was not a heredoc.
    let delimiter = match self.current_char() {
        Some(q @ ('"' | '\'' | '`')) if !backslashed => {
            text.push(q);
            self.advance();
            let mut delim = String::new();
            // Collect up to the matching close quote; an unterminated
            // quote keeps whatever was gathered so far.
            while let Some(ch) = self.current_char() {
                text.push(ch);
                self.advance();
                if ch == q {
                    break;
                }
                delim.push(ch);
            }
            delim
        }
        Some(c) if is_perl_identifier_start(c) => {
            let mut delim = String::new();
            while let Some(c) = self.current_char() {
                if is_perl_identifier_continue(c) {
                    delim.push(c);
                    text.push(c);
                    self.advance();
                } else {
                    break;
                }
            }
            delim
        }
        // Not a heredoc: rewind and let `<<` lex as an operator.
        _ => {
            self.position = start;
            return None;
        }
    };
    self.mode = LexerMode::ExpectOperator;
    if self.pending_heredocs.len() >= MAX_HEREDOC_DEPTH {
        return Some(Token {
            token_type: TokenType::Error(Arc::from("Heredoc nesting too deep")),
            text: Arc::from(text),
            start,
            end: self.position,
        });
    }
    // body_start == 0 means "unknown"; it is filled in when the newline
    // ending the announcing line is consumed.
    self.pending_heredocs.push(HeredocSpec {
        label: Arc::from(delimiter.as_str()),
        body_start: 0,
        allow_indent,
    });
    Some(Token {
        token_type: TokenType::HeredocStart,
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Dispatches on the current char to the appropriate string parser:
/// double-quoted, single-quoted, backtick, or `q{...}`.
fn try_string(&mut self) -> Option<Token> {
    let start = self.position;
    match self.current_char()? {
        '"' => self.parse_double_quoted_string(start),
        '\'' => self.parse_single_quoted_string(start),
        '`' => self.parse_backtick_string(start),
        // Only the brace form of `q` is handled here; other q-operators
        // go through the quote-operator machinery.
        'q' if self.peek_char(1) == Some('{') => self.parse_q_string(start),
        _ => None,
    }
}
/// Lexes a numeric literal: radix-prefixed integers (`0x`, `0b`, `0o`),
/// decimal integers, an optional fractional part, and an optional
/// exponent. Underscores are accepted as digit separators throughout.
/// Returns `None` when the current byte is not an ASCII digit.
///
/// The three radix branches were previously triplicated; they now share
/// one scanner parameterized by the accepted digit set. Fall-through
/// semantics (a bare `0x` lexes `0` as decimal) are preserved.
#[inline]
fn try_number(&mut self) -> Option<Token> {
    let start = self.position;
    let bytes = self.input_bytes;
    if self.position >= bytes.len() || !Self::byte_at(bytes, self.position).is_ascii_digit() {
        return None;
    }
    let mut pos = self.position;
    // Radix-prefixed literals.
    if Self::byte_at(bytes, pos) == b'0' && pos + 1 < bytes.len() {
        let digit_ok: Option<fn(u8) -> bool> = match bytes[pos + 1] {
            b'x' | b'X' => Some(|b| b.is_ascii_hexdigit() || b == b'_'),
            b'b' | b'B' => Some(|b| matches!(b, b'0' | b'1' | b'_')),
            b'o' | b'O' => Some(|b| matches!(b, b'0'..=b'7' | b'_')),
            _ => None,
        };
        if let Some(digit_ok) = digit_ok {
            let digit_start = pos + 2;
            let mut scan = digit_start;
            while scan < bytes.len() && digit_ok(bytes[scan]) {
                scan += 1;
            }
            // Commit only if at least one digit followed the prefix;
            // otherwise fall through and lex the leading `0` as decimal.
            if scan > digit_start {
                self.position = scan;
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                return Some(Token {
                    token_type: TokenType::Number(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }
        }
    }
    // Integer part.
    pos = self.position;
    while pos < bytes.len() {
        let byte = Self::byte_at(bytes, pos);
        if byte.is_ascii_digit() || byte == b'_' {
            pos += 1;
        } else {
            break;
        }
    }
    self.position = pos;
    // Fractional part. A dot is consumed only when a digit follows, or
    // when what follows cannot continue another construct (EOF,
    // whitespace/control, or a clear operator byte) — this keeps `1..2`
    // and `$x.method`-style text out of the number.
    if pos < bytes.len() && Self::byte_at(bytes, pos) == b'.' {
        let has_following_digit = pos + 1 < bytes.len() && bytes[pos + 1].is_ascii_digit();
        let should_consume_dot = has_following_digit || {
            pos + 1 >= bytes.len() || {
                let next_byte = bytes[pos + 1];
                next_byte <= b' '
                    || matches!(
                        next_byte,
                        b';' | b','
                            | b')'
                            | b'}'
                            | b']'
                            | b'+'
                            | b'-'
                            | b'*'
                            | b'/'
                            | b'%'
                            | b'='
                            | b'<'
                            | b'>'
                            | b'!'
                            | b'&'
                            | b'|'
                            | b'^'
                            | b'~'
                            | b'e'
                            | b'E'
                    )
            }
        };
        if should_consume_dot {
            pos += 1;
            while pos < bytes.len() && (bytes[pos].is_ascii_digit() || bytes[pos] == b'_') {
                pos += 1;
            }
            self.position = pos;
        }
    }
    // Exponent: `e`/`E`, optional sign, then digits. Rolled back entirely
    // when no digit follows, so `1e` lexes as `1` then identifier `e`.
    if pos < bytes.len() && (bytes[pos] == b'e' || bytes[pos] == b'E') {
        let exp_start = pos;
        pos += 1;
        if pos < bytes.len() && (bytes[pos] == b'+' || bytes[pos] == b'-') {
            pos += 1;
        }
        let mut saw_digit = false;
        while pos < bytes.len() {
            let byte = bytes[pos];
            if byte.is_ascii_digit() {
                saw_digit = true;
                pos += 1;
            } else if byte == b'_' {
                pos += 1;
            } else {
                break;
            }
        }
        if !saw_digit {
            pos = exp_start;
        }
        self.position = pos;
    }
    let text = &self.input[start..self.position];
    self.mode = LexerMode::ExpectOperator;
    Some(Token {
        token_type: TokenType::Number(Arc::from(text)),
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Continues a decimal literal whose first digit sits at `start`,
/// consuming digits, `_` separators, and at most one exponent.
///
/// Fix: an `e`/`E` with an optional sign but no following digit is now
/// rolled back entirely (cursor restored to the `e`), matching
/// `try_number`. Previously `exponent_start - 1` was restored, which —
/// when a sign had been consumed — pointed at the sign and left the `e`
/// inside the number (`1e+x` lexed as number `1e`).
fn parse_decimal_number(&mut self, start: usize) -> Option<Token> {
    self.advance(); // consume the digit at `start`
    while self.position < self.input_bytes.len() {
        let byte = self.input_bytes[self.position];
        match byte {
            b'0'..=b'9' | b'_' => self.position += 1,
            b'e' | b'E' => {
                // Remember where the exponent marker begins so the whole
                // `e[+-]` sequence can be undone if no digit follows.
                let exp_marker = self.position;
                self.advance();
                if self.position < self.input_bytes.len() {
                    let next = self.input_bytes[self.position];
                    if next == b'+' || next == b'-' {
                        self.advance();
                    }
                }
                let mut saw_digit = false;
                while self.position < self.input_bytes.len() {
                    let byte = self.input_bytes[self.position];
                    if byte.is_ascii_digit() {
                        saw_digit = true;
                        self.position += 1;
                    } else if byte == b'_' {
                        self.position += 1;
                    } else {
                        break;
                    }
                }
                if !saw_digit {
                    self.position = exp_marker;
                }
                break;
            }
            _ => break,
        }
    }
    let text = &self.input[start..self.position];
    self.mode = LexerMode::ExpectOperator;
    Some(Token {
        token_type: TokenType::Number(Arc::from(text)),
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Lexes a variable-like token beginning with a sigil (`$`, `@`, `%`,
/// `*`): named variables (optionally package-qualified), `$#array`
/// last-index forms, `${...}` brace forms, `$^X` control variables,
/// punctuation variables, and the `@+`/`@-`/`%+`/`%-` specials. Returns
/// `None` when the current char is not a sigil, or when `%`/`*` should
/// lex as an operator instead.
fn try_variable(&mut self) -> Option<Token> {
    let start = self.position;
    let sigil = self.current_char()?;
    match sigil {
        '$' | '@' | '%' | '*' => {
            // After a complete term, `%` and `*` are operators (mod/mul).
            if self.mode == LexerMode::ExpectOperator && matches!(sigil, '*' | '%') {
                return None;
            }
            self.advance();
            // Postfix dereference (`->@*`, `->%{...}`, ...): when the two
            // bytes immediately before the sigil are `->` and `{`/`[`/`*`
            // follows, emit the bare sigil as its own token.
            let check_arrow = self.position >= 3
                && self.position.saturating_sub(1) <= self.input.len()
                && self.input.is_char_boundary(self.position.saturating_sub(3))
                && self.input.is_char_boundary(self.position.saturating_sub(1));
            if check_arrow
                && {
                    // Temporarily rewind to test for the `->` bytes.
                    let saved = self.position;
                    self.position -= 3;
                    let arrow = self.matches_bytes(b"->");
                    self.position = saved;
                    arrow
                }
                && matches!(self.current_char(), Some('{' | '[' | '*'))
            {
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }
            // `$#name` / `$#Pkg::name` last-index variables.
            if sigil == '$' && self.current_char() == Some('#') {
                self.advance();
                while let Some(ch) = self.current_char() {
                    if is_perl_identifier_continue(ch) {
                        self.advance();
                    } else if ch == ':' && self.peek_char(1) == Some(':') {
                        self.advance();
                        self.advance();
                    } else {
                        break;
                    }
                }
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                self.after_var_subscript = true;
                return Some(Token {
                    token_type: TokenType::Identifier(Arc::from(text)),
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }
            if self.current_char() == Some('{') {
                // Block dereference (`${ $x }`, `@{...}`): emit the bare
                // sigil and let `{` lex as its own token.
                let next_char = self.peek_char(1);
                let is_deref = sigil != '*'
                    && (matches!(
                        next_char,
                        Some('$' | '@' | '%' | '*' | '&' | '[' | ' ' | '\t' | '\n' | '\r',)
                    ) || (matches!(sigil, '@' | '%')
                        && next_char.is_some_and(is_perl_identifier_start)));
                if is_deref {
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                }
                self.advance();
                if self.current_char() == Some('^') {
                    // `${^NAME}` control variable.
                    self.advance();
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance();
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                else if self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                    // `${::name}` / `${::{name}}` stash access.
                    self.advance();
                    self.advance();
                    if self.current_char() == Some('{') {
                        self.advance();
                    }
                    while let Some(ch) = self.current_char() {
                        if ch == '}' {
                            self.advance();
                            if self.current_char() == Some('}') {
                                self.advance();
                            }
                            break;
                        } else if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                }
                else {
                    // Deref detected only after consuming `{`: rewind to
                    // just past the sigil and emit the sigil alone.
                    if sigil != '*'
                        && (matches!(
                            self.current_char(),
                            Some(
                                '$' | '@'
                                    | '%'
                                    | '*'
                                    | '&'
                                    | '['
                                    | ' '
                                    | '\t'
                                    | '\n'
                                    | '\r'
                                    | '}'
                            )
                        ) || self.current_char().is_none())
                    {
                        self.position = start + 1;
                        let text = &self.input[start..self.position];
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            text: Arc::from(text),
                            start,
                            end: self.position,
                        });
                    }
                    if sigil == '*' {
                        // Glob `*{...}` may contain nested braces: scan
                        // to the balanced closing brace.
                        let mut brace_depth: usize = 1;
                        while let Some(ch) = self.current_char() {
                            if ch == '{' {
                                brace_depth += 1;
                            } else if ch == '}' {
                                brace_depth = brace_depth.saturating_sub(1);
                                if brace_depth == 0 {
                                    self.advance();
                                    break;
                                }
                            }
                            self.advance();
                        }
                    } else {
                        // `${name}`: consume the identifier and the `}`.
                        while let Some(ch) = self.current_char() {
                            if ch == '}' {
                                self.advance();
                                break;
                            } else if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
            }
            else if let Some(ch) = self.current_char() {
                if is_perl_identifier_start(ch) {
                    // Ordinary named variable, optionally `::`-qualified.
                    while let Some(ch) = self.current_char() {
                        if is_perl_identifier_continue(ch) {
                            self.advance();
                        } else {
                            break;
                        }
                    }
                    while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
                        self.advance();
                        self.advance();
                        while let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        }
                    }
                }
                else if sigil == '$' && ch == '^' && !self.in_prototype {
                    // `$^W`-style control variable (one uppercase letter).
                    self.advance();
                    if let Some(letter) = self.current_char()
                        && letter.is_ascii_uppercase()
                    {
                        self.advance();
                    }
                }
                else if sigil == '$'
                    && !self.in_prototype
                    && matches!(
                        ch,
                        '?' | '!'
                            | '@'
                            | '&'
                            | '`'
                            | '\''
                            | '.'
                            | '/'
                            | '\\'
                            | '|'
                            | '+'
                            | '-'
                            | '['
                            | ']'
                            | '$'
                            | '~'
                            | '='
                            | '%'
                            | ','
                            | '"'
                            | ';'
                            | '>'
                            | '<'
                            | ')'
                            | '(' )
                {
                    // Punctuation variable ($!, $@, $/, $$, ...).
                    self.advance();
                }
                else if sigil == '$' && ch == '$' {
                    // Reached only when the punctuation branch above did
                    // not fire (e.g. inside prototypes): `$$` is the PID
                    // unless an identifier follows (deref like `$$name`).
                    if !self.peek_char(1).is_some_and(is_perl_identifier_start) {
                        self.advance();
                    }
                }
                else if (sigil == '@' || sigil == '%') && matches!(ch, '+' | '-') {
                    // `@+`, `@-`, `%+`, `%-` regex special variables.
                    self.advance();
                }
            }
            let text = &self.input[start..self.position];
            self.mode = LexerMode::ExpectOperator;
            self.after_var_subscript = matches!(sigil, '$' | '@' | '%');
            Some(Token {
                token_type: TokenType::Identifier(Arc::from(text)),
                text: Arc::from(text),
                start,
                end: self.position,
            })
        }
        _ => None,
    }
}
/// Returns the first non-whitespace char at or after the cursor and the
/// char immediately following it, without moving the cursor.
fn peek_nonspace_and_following(&self) -> (Option<char>, Option<char>) {
    let Some(rest) = self.input.get(self.position..) else {
        return (None, None);
    };
    let mut chars = rest.chars().skip_while(|c| c.is_whitespace());
    let first = chars.next();
    let second = chars.next();
    // When nothing non-whitespace remains, both slots are None.
    (first, second)
}
/// A quote delimiter is anything that is neither ASCII alphanumeric nor
/// whitespace.
fn is_quote_delim(c: char) -> bool {
    !(c.is_ascii_alphanumeric() || c.is_whitespace())
}
/// Lexes a v-string (`v5.10.1`). Requires `v` followed by digits and at
/// least one `.N` component — a bare `v1` is left for the identifier
/// lexer, as is anything where an identifier character follows the
/// digits (e.g. `v5x`). A dot not followed by a digit is not consumed.
#[inline]
fn try_vstring(&mut self) -> Option<Token> {
    let bytes = self.input_bytes;
    let start = self.position;
    // Must begin with `v` immediately followed by a digit.
    if bytes.get(start) != Some(&b'v') {
        return None;
    }
    if !bytes.get(start + 1).is_some_and(|b| b.is_ascii_digit()) {
        return None;
    }
    // First numeric component.
    let mut cursor = start + 1;
    while bytes.get(cursor).is_some_and(|b| b.is_ascii_digit()) {
        cursor += 1;
    }
    // Subsequent `.NNN` components.
    while bytes.get(cursor) == Some(&b'.') {
        if !bytes.get(cursor + 1).is_some_and(|b| b.is_ascii_digit()) {
            // Trailing dot (e.g. `v1.2.`): leave the dot unconsumed.
            break;
        }
        cursor += 1;
        while bytes.get(cursor).is_some_and(|b| b.is_ascii_digit()) {
            cursor += 1;
        }
    }
    // Reject when an identifier character follows the digits.
    if let Some(&next_byte) = bytes.get(cursor) {
        if next_byte == b'_' || next_byte.is_ascii_alphabetic() {
            return None;
        }
        if next_byte >= 128
            && let Some(ch) = self.input.get(cursor..).and_then(|s| s.chars().next())
            && is_perl_identifier_continue(ch)
        {
            return None;
        }
    }
    let text = &self.input[start..cursor];
    // Without a dot this is an ordinary identifier, not a v-string.
    if !text.contains('.') {
        return None;
    }
    self.position = cursor;
    self.mode = LexerMode::ExpectOperator;
    Some(Token {
        token_type: TokenType::Version(Arc::from(text)),
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Lexes a bareword: identifiers, keywords, package names (`Foo::Bar`),
/// quote-like operators (q/qq/qw/qx/qr/m/s/tr/y) including delimiter
/// discovery, and the `__DATA__`/`__END__` section markers.
#[inline]
fn try_identifier_or_keyword(&mut self) -> Option<Token> {
    let start = self.position;
    let ch = self.current_char()?;
    if is_perl_identifier_start(ch) {
        // `s'...'`, `y'...'`, `tr'...'` use a quote as the delimiter and
        // must be routed before ordinary identifier scanning. After `->`
        // these are method names, not operators.
        if !self.after_arrow && ch == 's' && self.peek_char(1) == Some('\'') {
            self.advance();
            return self.parse_substitution(start);
        } else if !self.after_arrow && ch == 'y' && self.peek_char(1) == Some('\'') {
            self.advance();
            return self.parse_transliteration(start);
        } else if !self.after_arrow
            && ch == 't'
            && self.peek_char(1) == Some('r')
            && self.peek_char(2) == Some('\'')
        {
            self.advance();
            self.advance();
            return self.parse_transliteration(start);
        }
        // Scan the identifier. A `'` stops the scan only when the text so
        // far is a quote operator (there `'` is its delimiter).
        while let Some(ch) = self.current_char() {
            if ch == '\''
                && matches!(
                    &self.input[start..self.position],
                    "m" | "q" | "qq" | "qw" | "qx" | "qr"
                )
            {
                break;
            }
            if is_perl_identifier_continue(ch) {
                self.advance();
            } else {
                break;
            }
        }
        // Package qualification: `Foo::Bar::baz`.
        while self.current_char() == Some(':') && self.peek_char(1) == Some(':') {
            self.advance();
            self.advance();
            if let Some(ch) = self.current_char()
                && is_perl_identifier_start(ch)
            {
                self.advance();
                while let Some(ch) = self.current_char() {
                    if is_perl_identifier_continue(ch) {
                        self.advance();
                    } else {
                        break;
                    }
                }
            }
        }
        let text = &self.input[start..self.position];
        // __DATA__/__END__ only count in the ordinary code channel.
        let in_code_channel =
            !matches!(self.mode, LexerMode::InDataSection | LexerMode::InFormatBody)
                && self.pending_heredocs.is_empty();
        let marker = if in_code_channel {
            if text == "__DATA__" {
                Some("__DATA__")
            } else if text == "__END__" {
                Some("__END__")
            } else {
                None
            }
        } else {
            None
        };
        if let Some(marker_text) = marker {
            // The marker must start its own line and be followed only by
            // whitespace; consume through the end of that line.
            if self.after_newline {
                if Self::trailing_ws_only(self.input_bytes, self.position) {
                    while self.position < self.input.len()
                        && self.input_bytes[self.position] != b'\n'
                    {
                        self.advance();
                    }
                    if self.position < self.input.len()
                        && self.input_bytes[self.position] == b'\n'
                    {
                        self.advance();
                    }
                    self.mode = LexerMode::InDataSection;
                    return Some(Token {
                        token_type: TokenType::DataMarker(Arc::from(marker_text)),
                        text: Arc::from(marker_text),
                        start,
                        end: self.position,
                    });
                }
            }
        }
        // `s`/`tr`/`y` followed by a plausible delimiter begin a
        // substitution / transliteration.
        #[allow(clippy::collapsible_if)]
        if !self.after_arrow && matches!(text, "s" | "tr" | "y") {
            if let Some(next) = self.current_char() {
                if matches!(
                    next,
                    '/' | '|'
                        | '\''
                        | '{'
                        | '['
                        | '('
                        | '<'
                        | '!'
                        | '#'
                        | '@'
                        | '$'
                        | '%'
                        | '^'
                        | '&'
                        | '*'
                        | '+'
                        | '='
                        | '~'
                        | '`'
                ) {
                    match text {
                        "s" => {
                            return self.parse_substitution(start);
                        }
                        "tr" | "y" => {
                            return self.parse_transliteration(start);
                        }
                        unexpected => {
                            // Unreachable given the outer `matches!`; kept
                            // as a defensive error token.
                            return Some(Token {
                                token_type: TokenType::Error(Arc::from(format!(
                                    "Unexpected substitution operator '{}': expected 's', 'tr', or 'y' at position {}",
                                    unexpected, start
                                ))),
                                text: Arc::from(unexpected),
                                start,
                                end: self.position,
                            });
                        }
                    }
                }
            }
        }
        let token_type = if is_keyword(text) {
            match text {
                // Control/list keywords put the lexer back into term
                // position (a following `/` starts a regex, not division).
                "if" | "unless" | "while" | "until" | "for" | "foreach" | "grep" | "map"
                | "sort" | "split" => {
                    self.mode = LexerMode::ExpectTerm;
                }
                "sub" => {
                    self.after_sub = true;
                }
                // Quote-like operators: decide whether the next char is
                // really a delimiter, then hand off to the quote parser.
                op if !self.after_arrow
                    && self.hash_brace_depth == 0
                    && quote_handler::is_quote_operator(op) =>
                {
                    // Candidate delimiter: the immediate char, or the
                    // first non-whitespace char when separated by spaces.
                    let immediate = self.current_char();
                    let (candidate, char_after_next, has_whitespace) =
                        if immediate.is_some_and(|c| c.is_whitespace()) {
                            let (nc, ca) = self.peek_nonspace_and_following();
                            (nc, ca, true)
                        } else {
                            let following = immediate.and_then(|c| {
                                let j = self.position + c.len_utf8();
                                self.input.get(j..).and_then(|s| s.chars().next())
                            });
                            (immediate, following, false)
                        };
                    if let Some(next) = candidate {
                        // `q => ...` is a hash key, not a quote. After
                        // whitespace only paired or quoting delimiters
                        // are accepted.
                        let is_fat_arrow = next == '=' && char_after_next == Some('>');
                        let is_paired_delim = matches!(next, '{' | '[' | '(' | '<');
                        let is_quote_char = matches!(next, '\'' | '"') && op != "s";
                        let is_valid_delim = Self::is_quote_delim(next)
                            && !is_fat_arrow
                            && (!has_whitespace || is_paired_delim || is_quote_char);
                        if is_valid_delim {
                            self.mode = LexerMode::ExpectDelimiter;
                            self.current_quote_op = Some(quote_handler::QuoteOperatorInfo {
                                operator: op.to_string(),
                                delimiter: '\0',
                                start_pos: start,
                            });
                            // Skip to the delimiter itself.
                            while let Some(ch) = self.current_char() {
                                if ch.is_whitespace() {
                                    self.advance();
                                } else {
                                    break;
                                }
                            }
                            #[allow(clippy::collapsible_if)]
                            if let Some(delim) = self.current_char() {
                                if !delim.is_alphanumeric() {
                                    self.advance();
                                    if let Some(ref mut info) = self.current_quote_op {
                                        info.delimiter = delim;
                                    }
                                    return self.parse_quote_operator(delim);
                                }
                            }
                        } else {
                            // Not a quote after all: plain identifier.
                            self.current_quote_op = None;
                            self.mode = LexerMode::ExpectOperator;
                            return Some(Token {
                                token_type: TokenType::Identifier(Arc::from(text)),
                                start,
                                end: self.position,
                                text: Arc::from(text),
                            });
                        }
                    } else {
                        // EOF after the operator name: plain identifier.
                        self.current_quote_op = None;
                        self.mode = LexerMode::ExpectOperator;
                        return Some(Token {
                            token_type: TokenType::Identifier(Arc::from(text)),
                            start,
                            end: self.position,
                            text: Arc::from(text),
                        });
                    }
                    // The delimiter turned out to be alphanumeric: treat
                    // the operator name as an identifier.
                    self.current_quote_op = None;
                    self.mode = LexerMode::ExpectOperator;
                    return Some(Token {
                        token_type: TokenType::Identifier(Arc::from(text)),
                        start,
                        end: self.position,
                        text: Arc::from(text),
                    });
                }
                "format" => {
                    // NOTE(review): intentionally empty — format-body mode
                    // is entered via `enter_format_mode`, not here.
                }
                _ => {}
            }
            TokenType::Keyword(Arc::from(text))
        } else {
            // Built-ins expect a term next (so `/` starts a regex);
            // unknown identifiers expect an operator.
            if is_builtin_function(text) {
                self.mode = LexerMode::ExpectTerm;
            } else {
                self.mode = LexerMode::ExpectOperator;
            }
            TokenType::Identifier(Arc::from(text))
        };
        self.after_arrow = false;
        self.after_var_subscript = false;
        Some(Token { token_type, text: Arc::from(text), start, end: self.position })
    } else {
        None
    }
}
fn parse_data_body(&mut self) -> Option<Token> {
    // Everything following a __DATA__/__END__ marker is emitted as one
    // opaque body token; the lexer never re-enters normal scanning for it.
    let start = self.position;
    if start >= self.input.len() {
        // Already at end of input: emit a zero-width EOF token.
        self.mode = LexerMode::ExpectTerm;
        return Some(Token {
            token_type: TokenType::EOF,
            text: Arc::from(""),
            start,
            end: start,
        });
    }
    // Swallow the remainder of the input in a single token; the body text is
    // built once and shared between the token type and its text field.
    let body: Arc<str> = Arc::from(&self.input[start..]);
    self.position = self.input.len();
    self.mode = LexerMode::ExpectTerm;
    Some(Token {
        token_type: TokenType::DataBody(Arc::clone(&body)),
        text: body,
        start,
        end: self.position,
    })
}
/// Consumes a Perl `format` body: everything up to a line containing only a
/// `.` (optionally followed by trailing whitespace), which terminates it.
///
/// Returns a `FormatBody` token on success, or an `Error` token when the
/// input ends before the `.` terminator line is found.
fn parse_format_body(&mut self) -> Option<Token> {
    let start = self.position;
    let mut body = String::new();
    let mut line_start = true;
    while self.position < self.input.len() {
        // A `.` at the start of a line may be the terminator; look ahead to
        // confirm the rest of that line is whitespace only.
        if line_start && self.current_char() == Some('.') {
            let mut peek_pos = self.position + 1;
            let mut found_terminator = true;
            while peek_pos < self.input.len() {
                match self.input_bytes[peek_pos] {
                    b' ' | b'\t' | b'\r' => peek_pos += 1,
                    b'\n' => break,
                    _ => {
                        found_terminator = false;
                        break;
                    }
                }
            }
            if found_terminator {
                // Skip past the terminator line, including its newline.
                self.position = peek_pos;
                if self.position < self.input.len() && self.input_bytes[self.position] == b'\n'
                {
                    self.position += 1;
                }
                self.mode = LexerMode::ExpectTerm;
                // Fix: build the Arc once and share it between the token type
                // and the text field, instead of cloning the whole String and
                // allocating two separate Arc buffers.
                let body_arc: Arc<str> = Arc::from(body);
                return Some(Token {
                    token_type: TokenType::FormatBody(Arc::clone(&body_arc)),
                    text: body_arc,
                    start,
                    end: self.position,
                });
            }
        }
        match self.current_char() {
            Some(ch) => {
                body.push(ch);
                self.advance();
                // Only a `.` immediately after a newline can terminate.
                line_start = ch == '\n';
            }
            None => {
                break;
            }
        }
    }
    // Ran out of input without seeing the `.` terminator line.
    self.mode = LexerMode::ExpectTerm;
    Some(Token {
        token_type: TokenType::Error(Arc::from("Unterminated format body")),
        text: Arc::from(body),
        start,
        end: self.position,
    })
}
/// Attempts to lex an operator at the current position.
///
/// `/` is handled specially: in term position it opens a regex; in operator
/// position it becomes `//=`, `//`, `/=`, or plain division. All other
/// punctuation goes through the one-/two-/three-character compound-operator
/// tables. Returns `None` when the current char cannot start an operator.
fn try_operator(&mut self) -> Option<Token> {
    // While a quote operator is still waiting for its delimiter, punctuation
    // must not be consumed as an operator.
    if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
        return None;
    }
    let start = self.position;
    let ch = self.current_char()?;
    if ch == '/' {
        if self.mode == LexerMode::ExpectTerm {
            // Term position: `/` opens a regex match.
            return self.parse_regex(start);
        } else {
            self.advance();
            if self.peek_byte(0) == Some(b'/') {
                // Second `/`: defined-or, possibly the `//=` assignment form.
                self.position += 1;
                if self.peek_byte(0) == Some(b'=') {
                    self.position += 1;
                    let text = &self.input[start..self.position];
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Operator(Arc::from(text)),
                        text: Arc::from(text),
                        start,
                        end: self.position,
                    });
                } else {
                    self.mode = LexerMode::ExpectTerm;
                    return Some(Token {
                        token_type: TokenType::Operator(Arc::from("//")),
                        text: Arc::from("//"),
                        start,
                        end: self.position,
                    });
                }
            } else if self.position < self.input_bytes.len()
                && self.input_bytes[self.position] == b'='
            {
                // `/=` compound assignment.
                self.position += 1;
                self.mode = LexerMode::ExpectTerm;
                return Some(Token {
                    token_type: TokenType::Operator(Arc::from("/=")),
                    text: Arc::from("/="),
                    start,
                    end: self.position,
                });
            } else {
                // Plain division.
                self.mode = LexerMode::ExpectTerm;
                return Some(Token {
                    token_type: TokenType::Division,
                    text: Arc::from("/"),
                    start,
                    end: self.position,
                });
            }
        }
    }
    match ch {
        '.' => {
            // `.5`-style numeric literal is only possible in term position.
            if self.mode != LexerMode::ExpectOperator
                && self.peek_char(1).is_some_and(|c| c.is_ascii_digit())
            {
                return self.parse_decimal_number(start);
            }
            self.advance();
            #[allow(clippy::collapsible_if)]
            if let Some(next) = self.current_char() {
                if is_compound_operator(ch, next) {
                    self.advance();
                    if self.position < self.input.len() {
                        // Extend to three-char operators; for `.` only the
                        // `...` range/ellipsis case can fire here.
                        let third = self.current_char();
                        if matches!(
                            (ch, next, third),
                            ('*', '*', Some('='))
                                | ('<', '<', Some('='))
                                | ('>', '>', Some('='))
                                | ('&', '&', Some('='))
                                | ('|', '|', Some('='))
                                | ('/', '/', Some('='))
                        ) {
                            self.advance();
                        } else if ch == '<' && next == '=' && third == Some('>') {
                            self.advance();
                        } else if ch == '.' && next == '.' && third == Some('.') {
                            self.advance();
                        }
                    }
                }
            }
        }
        '+' | '-' | '*' | '%' | '&' | '|' | '^' | '~' | '!' | '=' | '<' | '>' | ':' | '?'
        | '\\' => {
            self.advance();
            #[allow(clippy::collapsible_if)]
            if let Some(next) = self.current_char() {
                if is_compound_operator(ch, next) {
                    self.advance();
                    if self.position < self.input.len() {
                        // Extend to three-char operators (`**=`, `<<=`, `>>=`,
                        // `&&=`, `||=`, and the `<=>` spaceship).
                        let third = self.current_char();
                        if matches!(
                            (ch, next, third),
                            ('*', '*', Some('='))
                                | ('<', '<', Some('='))
                                | ('>', '>', Some('='))
                                | ('&', '&', Some('='))
                                | ('|', '|', Some('='))
                                | ('/', '/', Some('='))
                        ) {
                            self.advance();
                        } else if ch == '<' && next == '=' && third == Some('>') {
                            self.advance();
                        }
                    }
                }
            }
        }
        _ => return None,
    }
    let text = &self.input[start..self.position];
    self.after_sub = false;
    self.after_arrow = text == "->";
    self.after_var_subscript = false;
    // Postfix `++`/`--` stay in operator position; every other operator
    // expects a term next.
    if (text == "++" || text == "--") && self.mode == LexerMode::ExpectOperator {
    } else {
        self.mode = LexerMode::ExpectTerm;
    }
    Some(Token {
        token_type: TokenType::Operator(Arc::from(text)),
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Attempts to lex structural delimiters (parens, brackets, braces, comma,
/// semicolon), or — when a quote operator is pending in `ExpectDelimiter`
/// mode — consumes the current char as that operator's delimiter.
fn try_delimiter(&mut self) -> Option<Token> {
    let start = self.position;
    let ch = self.current_char()?;
    // A quote operator is waiting for its delimiter: any non-alphanumeric,
    // non-whitespace char starts the quoted body.
    if matches!(self.mode, LexerMode::ExpectDelimiter) && self.current_quote_op.is_some() {
        if !ch.is_alphanumeric() && !ch.is_whitespace() {
            self.advance();
            if let Some(ref mut info) = self.current_quote_op {
                info.delimiter = ch;
            }
            return self.parse_quote_operator(ch);
        }
    }
    match ch {
        '(' => {
            // NOTE(review): this branch appears unreachable — the identical
            // ExpectDelimiter check above already consumes '(' in that mode.
            if matches!(self.mode, LexerMode::ExpectDelimiter)
                && self.current_quote_op.is_some()
            {
                self.advance();
                if let Some(ref mut info) = self.current_quote_op {
                    info.delimiter = ch;
                }
                return self.parse_quote_operator(ch);
            }
            self.advance();
            // `sub name (` opens a prototype; track paren nesting so we know
            // when the prototype closes.
            if self.after_sub {
                self.in_prototype = true;
                self.after_sub = false;
                self.prototype_depth = 1;
            } else if self.in_prototype {
                self.prototype_depth += 1;
            }
            self.paren_depth += 1;
            self.after_var_subscript = false;
            self.mode = LexerMode::ExpectTerm;
            Some(Token {
                token_type: TokenType::LeftParen,
                text: Arc::from("("),
                start,
                end: self.position,
            })
        }
        ')' => {
            self.advance();
            if self.in_prototype && self.prototype_depth > 0 {
                self.prototype_depth -= 1;
                if self.prototype_depth == 0 {
                    self.in_prototype = false;
                }
            }
            self.after_arrow = false;
            self.paren_depth = self.paren_depth.saturating_sub(1);
            self.after_var_subscript = false;
            self.mode = LexerMode::ExpectOperator;
            Some(Token {
                token_type: TokenType::RightParen,
                text: Arc::from(")"),
                start,
                end: self.position,
            })
        }
        ';' => {
            // Statement boundary clears all cross-token context flags.
            self.advance();
            self.after_sub = false;
            self.after_arrow = false;
            self.after_var_subscript = false;
            self.mode = LexerMode::ExpectTerm;
            Some(Token {
                token_type: TokenType::Semicolon,
                text: Arc::from(";"),
                start,
                end: self.position,
            })
        }
        ',' => {
            self.advance();
            self.after_var_subscript = false;
            self.mode = LexerMode::ExpectTerm;
            Some(Token {
                token_type: TokenType::Comma,
                text: Arc::from(","),
                start,
                end: self.position,
            })
        }
        '[' => {
            self.advance();
            self.after_var_subscript = false;
            self.mode = LexerMode::ExpectTerm;
            Some(Token {
                token_type: TokenType::LeftBracket,
                text: Arc::from("["),
                start,
                end: self.position,
            })
        }
        ']' => {
            self.advance();
            // `]` ends an array subscript, so a `{` right after it is a hash
            // subscript, not a block.
            self.after_var_subscript = true;
            self.mode = LexerMode::ExpectOperator;
            Some(Token {
                token_type: TokenType::RightBracket,
                text: Arc::from("]"),
                start,
                end: self.position,
            })
        }
        '{' => {
            self.advance();
            self.after_sub = false;
            // Count only hash-subscript braces (those following `]`/`}`),
            // not code blocks.
            if self.after_var_subscript {
                self.hash_brace_depth = self.hash_brace_depth.saturating_add(1);
            }
            self.after_var_subscript = false;
            self.mode = LexerMode::ExpectTerm;
            Some(Token {
                token_type: TokenType::LeftBrace,
                text: Arc::from("{"),
                start,
                end: self.position,
            })
        }
        '}' => {
            self.advance();
            self.after_arrow = false;
            if self.hash_brace_depth > 0 {
                self.hash_brace_depth -= 1;
                // Closing a hash subscript: a following `{`/`[` chains.
                self.after_var_subscript = true;
            } else {
                self.after_var_subscript = false;
            }
            self.mode = LexerMode::ExpectOperator;
            Some(Token {
                token_type: TokenType::RightBrace,
                text: Arc::from("}"),
                start,
                end: self.position,
            })
        }
        '#' => {
            // Only consumed here when a delimiter is expected (e.g. `s#...#`);
            // otherwise `#` is left for other handlers — presumably the
            // comment path, which is outside this function.
            if matches!(self.mode, LexerMode::ExpectDelimiter) {
                self.advance();
                self.mode = LexerMode::ExpectTerm;
                Some(Token {
                    token_type: TokenType::Operator(Arc::from("#")),
                    text: Arc::from("#"),
                    start,
                    end: self.position,
                })
            } else {
                None
            }
        }
        _ => None,
    }
}
/// Lexes a `"..."` string starting at the opening quote.
///
/// When interpolation parsing is enabled, `$identifier` sequences become
/// `StringPart::Variable` parts; runs of plain text (escapes kept verbatim
/// with their backslash) become `StringPart::Literal`. A string with no
/// variable parts is emitted as a plain `StringLiteral`.
fn parse_double_quoted_string(&mut self, start: usize) -> Option<Token> {
    self.advance();
    let mut parts = Vec::new();
    let mut current_literal = String::new();
    let mut last_pos = self.position;
    while let Some(ch) = self.current_char() {
        match ch {
            '"' => {
                // Closing quote: flush any pending literal and emit.
                self.advance();
                if !current_literal.is_empty() {
                    parts.push(StringPart::Literal(Arc::from(current_literal)));
                }
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                return Some(Token {
                    token_type: if parts.is_empty() {
                        TokenType::StringLiteral
                    } else {
                        TokenType::InterpolatedString(parts)
                    },
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }
            '\\' => {
                // Escapes are preserved verbatim (backslash plus next char);
                // any decoding is left to downstream consumers.
                self.advance();
                if let Some(escaped) = self.current_char() {
                    if current_literal.capacity() == 0 {
                        current_literal.reserve(32);
                    }
                    current_literal.push('\\');
                    current_literal.push(escaped);
                    self.advance();
                }
            }
            '$' if self.config.parse_interpolation => {
                if !current_literal.is_empty() {
                    parts.push(StringPart::Literal(Arc::from(current_literal)));
                    current_literal = String::new();
                }
                self.advance();
                // Scan the identifier after `$`: ASCII fast path on bytes,
                // falling back to Unicode identifier rules for bytes >= 128.
                let var_start = self.position;
                while self.position < self.input_bytes.len() {
                    let byte = self.input_bytes[self.position];
                    if byte.is_ascii_alphanumeric() || byte == b'_' {
                        self.position += 1;
                    } else if byte >= 128 {
                        if let Some(ch) = self.current_char() {
                            if is_perl_identifier_continue(ch) {
                                self.advance();
                            } else {
                                break;
                            }
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if self.position > var_start {
                    // `var_start - 1` includes the `$` sigil in the name.
                    let var_name = &self.input[var_start - 1..self.position];
                    parts.push(StringPart::Variable(Arc::from(var_name)));
                }
                // NOTE(review): a `$` not followed by an identifier is
                // dropped from `parts` (the token's `text` still carries the
                // full source) — confirm consumers rely on `text`.
            }
            _ => {
                if current_literal.capacity() == 0 {
                    current_literal.reserve(32);
                }
                current_literal.push(ch);
                self.advance();
            }
        }
        // Defensive: abort if no forward progress was made.
        if self.position == last_pos {
            break;
        }
        last_pos = self.position;
    }
    // Hit end of input before the closing quote.
    let end = self.input.len();
    self.position = end;
    Some(Token {
        token_type: TokenType::Error(Arc::from("unterminated string")),
        text: Arc::from(&self.input[start..end]),
        start,
        end,
    })
}
fn parse_single_quoted_string(&mut self, start: usize) -> Option<Token> {
    // Consume the opening quote; inside single quotes only `\'` and `\\`
    // act as escapes, everything else is literal.
    self.advance();
    let mut prev_pos = self.position;
    while let Some(ch) = self.current_char() {
        if ch == '\'' {
            // Closing quote: the token text spans both quotes.
            self.advance();
            let text = &self.input[start..self.position];
            self.mode = LexerMode::ExpectOperator;
            return Some(Token {
                token_type: TokenType::StringLiteral,
                text: Arc::from(text),
                start,
                end: self.position,
            });
        }
        if ch == '\\' {
            self.advance();
            // Only a quote or a backslash after `\` is consumed as part of
            // the escape; any other char is left for the next iteration.
            if matches!(self.current_char(), Some('\'') | Some('\\')) {
                self.advance();
            }
        } else {
            self.advance();
        }
        // Defensive: bail out if no forward progress was made.
        if self.position == prev_pos {
            break;
        }
        prev_pos = self.position;
    }
    // End of input reached without a closing quote.
    let end = self.input.len();
    self.position = end;
    Some(Token {
        token_type: TokenType::Error(Arc::from("unterminated string")),
        text: Arc::from(&self.input[start..end]),
        start,
        end,
    })
}
fn parse_backtick_string(&mut self, start: usize) -> Option<Token> {
    // Consume the opening backtick of a `command` (qx-style) string.
    self.advance();
    let mut prev_pos = self.position;
    while let Some(ch) = self.current_char() {
        match ch {
            '`' => {
                // Closing backtick: the full span, backticks included, is
                // the token text.
                self.advance();
                self.mode = LexerMode::ExpectOperator;
                let text: Arc<str> = Arc::from(&self.input[start..self.position]);
                return Some(Token {
                    token_type: TokenType::QuoteCommand,
                    text,
                    start,
                    end: self.position,
                });
            }
            '\\' => {
                // A backslash escapes whatever char follows it.
                self.advance();
                if self.current_char().is_some() {
                    self.advance();
                }
            }
            _ => self.advance(),
        }
        // Defensive: stop if the position failed to move forward.
        if self.position == prev_pos {
            break;
        }
        prev_pos = self.position;
    }
    // End of input before the closing backtick.
    let end = self.input.len();
    self.position = end;
    Some(Token {
        token_type: TokenType::Error(Arc::from("unterminated string")),
        text: Arc::from(&self.input[start..end]),
        start,
        end,
    })
}
/// Stub for dedicated q// handling: always `None`. q-style operators are
/// presumably routed through the generic quote-operator path
/// (`parse_quote_operator`) instead — TODO(review): confirm intentional.
fn parse_q_string(&mut self, _start: usize) -> Option<Token> {
    None
}
/// Maps an opening delimiter to its closing counterpart.
///
/// Bracket-style delimiters close with their mirror image; any other
/// delimiter closes with the same character (e.g. `s/.../.../`).
fn paired_closing(delim: char) -> char {
    match delim {
        '(' => ')',
        '[' => ']',
        '{' => '}',
        '<' => '>',
        other => other,
    }
}
/// Looks ahead from an opening `quote` at byte offset `pos` to decide
/// whether the quoted span should be skipped as a nested string inside a
/// substitution replacement.
///
/// Returns `true` only when the string closes on the same line AND its
/// contents contain the replacement's `closing` delimiter — otherwise
/// treating it as a string would not change how the replacement parses.
fn repl_inner_string_lookahead(input: &str, pos: usize, quote: char, closing: char) -> bool {
    let scan_from = pos + quote.len_utf8();
    let Some(rest) = input.get(scan_from..) else {
        return false;
    };
    let mut saw_closing = false;
    let mut escaped = false;
    for ch in rest.chars() {
        if escaped {
            // Fix: skip the whole escaped character. The previous version
            // advanced a single byte, so a backslash before a multi-byte
            // char landed mid-codepoint and aborted the scan spuriously.
            escaped = false;
            continue;
        }
        match ch {
            '\\' => escaped = true,
            // String does not close on this line: never treat as nested.
            '\n' => return false,
            _ => {
                if ch == closing {
                    saw_closing = true;
                }
                if ch == quote {
                    return saw_closing;
                }
            }
        }
    }
    false
}
/// Lexes an `s/pattern/replacement/mods` substitution; `start` is the offset
/// of the `s` and the current char is the opening delimiter.
///
/// Paired delimiters (`{ [ ( <`) nest inside the pattern, and the
/// replacement re-opens with its own (possibly different) delimiter after
/// optional whitespace, e.g. `s{...}[...]`. Trailing alphanumerics are
/// consumed as modifier flags. Emits one `Substitution` token spanning the
/// whole construct.
fn parse_substitution(&mut self, start: usize) -> Option<Token> {
    let delimiter = self.current_char()?;
    self.advance();
    let mut depth = 1;
    let is_paired = matches!(delimiter, '{' | '[' | '(' | '<');
    let closing = Self::paired_closing(delimiter);
    // Pattern section: scan to the matching closing delimiter.
    while let Some(ch) = self.current_char() {
        // Abort with the guard's token if scan budgets are exceeded.
        if let Some(token) = self.budget_guard(start, depth) {
            return Some(token);
        }
        match ch {
            '\\' => {
                // A backslash escapes the next char, delimiters included.
                self.advance();
                if self.current_char().is_some() {
                    self.advance();
                }
            }
            _ if ch == delimiter && is_paired => {
                depth += 1;
                self.advance();
            }
            _ if ch == closing => {
                self.advance();
                if is_paired {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        break;
                    }
                } else {
                    break;
                }
            }
            _ => self.advance(),
        }
    }
    // Replacement delimiter: with a paired pattern delimiter the replacement
    // opens fresh (any delimiter allowed) after optional whitespace;
    // otherwise the same delimiter separates pattern and replacement.
    let (repl_delimiter, repl_closing, repl_is_paired) = if is_paired {
        while let Some(ch) = self.current_char() {
            if ch.is_whitespace() {
                self.advance();
            } else {
                break;
            }
        }
        if let Some(repl_delim) = self.current_char() {
            if matches!(repl_delim, '{' | '[' | '(' | '<') {
                let repl_close = Self::paired_closing(repl_delim);
                self.advance();
                (repl_delim, repl_close, true)
            } else {
                self.advance();
                (repl_delim, repl_delim, false)
            }
        } else {
            // End of input right after the pattern: fall back to the
            // pattern's own pair.
            (delimiter, closing, is_paired)
        }
    } else {
        (delimiter, closing, false)
    };
    // Replacement section.
    let mut repl_depth: usize = 1;
    while let Some(ch) = self.current_char() {
        match ch {
            '\\' => {
                self.advance();
                if self.current_char().is_some() {
                    self.advance();
                }
            }
            '"' | '\''
                if ch != repl_closing
                    && Self::repl_inner_string_lookahead(
                        self.input,
                        self.position,
                        ch,
                        repl_closing,
                    ) =>
            {
                // A quoted string in the replacement that contains the
                // closing delimiter: skip it whole so the delimiter inside
                // the string does not terminate the replacement early.
                let quote = ch;
                self.advance();
                while let Some(inner) = self.current_char() {
                    if inner == '\\' {
                        self.advance();
                        if self.current_char().is_some() {
                            self.advance();
                        }
                    } else if inner == quote {
                        self.advance();
                        break;
                    } else {
                        self.advance();
                    }
                }
            }
            _ if ch == repl_delimiter && repl_is_paired => {
                repl_depth += 1;
                self.advance();
            }
            _ if ch == repl_closing => {
                self.advance();
                if repl_is_paired {
                    repl_depth = repl_depth.saturating_sub(1);
                    if repl_depth == 0 {
                        break;
                    }
                } else {
                    break;
                }
            }
            _ => self.advance(),
        }
    }
    // Modifier flags: any trailing alphanumeric run.
    while let Some(ch) = self.current_char() {
        if ch.is_ascii_alphanumeric() {
            self.advance();
        } else {
            break;
        }
    }
    let text = &self.input[start..self.position];
    self.mode = LexerMode::ExpectOperator;
    Some(Token {
        token_type: TokenType::Substitution,
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Lexes a `tr/.../.../mods` (or `y///`) transliteration; `start` is the
/// offset of the operator name and the current char is the opening delimiter.
///
/// NOTE(review): with paired delimiters the second list only re-opens with
/// the SAME delimiter (`tr{a}{b}`); unlike `parse_substitution`, a different
/// paired delimiter (`tr{a}[b]`) is not recognized here — confirm intended.
fn parse_transliteration(&mut self, start: usize) -> Option<Token> {
    let delimiter = self.current_char()?;
    self.advance();
    let mut depth = 1;
    let is_paired = matches!(delimiter, '{' | '[' | '(' | '<');
    let closing = Self::paired_closing(delimiter);
    // Search list: scan to the matching closing delimiter.
    while let Some(ch) = self.current_char() {
        // Abort with the guard's token if scan budgets are exceeded.
        if let Some(token) = self.budget_guard(start, depth) {
            return Some(token);
        }
        match ch {
            '\\' => {
                // A backslash escapes the next char, delimiters included.
                self.advance();
                if self.current_char().is_some() {
                    self.advance();
                }
            }
            _ if ch == delimiter && is_paired => {
                depth += 1;
                self.advance();
            }
            _ if ch == closing => {
                self.advance();
                if is_paired {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        break;
                    }
                } else {
                    break;
                }
            }
            _ => self.advance(),
        }
    }
    // Paired form: the replacement list re-opens with the same delimiter
    // after optional whitespace.
    if is_paired {
        while let Some(ch) = self.current_char() {
            if ch.is_whitespace() {
                self.advance();
            } else {
                break;
            }
        }
        if self.current_char() == Some(delimiter) {
            self.advance();
            depth = 1;
        }
    }
    // Replacement list: same scanning rules as the search list.
    while let Some(ch) = self.current_char() {
        match ch {
            '\\' => {
                self.advance();
                if self.current_char().is_some() {
                    self.advance();
                }
            }
            _ if ch == delimiter && is_paired => {
                depth += 1;
                self.advance();
            }
            _ if ch == closing => {
                self.advance();
                if is_paired {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        break;
                    }
                } else {
                    break;
                }
            }
            _ => self.advance(),
        }
    }
    // Modifier flags: any trailing alphanumeric run.
    while let Some(ch) = self.current_char() {
        if ch.is_ascii_alphanumeric() {
            self.advance();
        } else {
            break;
        }
    }
    let text = &self.input[start..self.position];
    self.mode = LexerMode::ExpectOperator;
    Some(Token {
        token_type: TokenType::Transliteration,
        text: Arc::from(text),
        start,
        end: self.position,
    })
}
/// Reads one delimited body (e.g. the pattern of `m{...}`), consuming the
/// closing delimiter but excluding it from the returned text.
///
/// Bracket-style delimiters nest; escape pairs (`\x`) are copied through
/// verbatim and never counted as delimiters.
fn read_delimited_body(&mut self, delim: char) -> String {
    let paired = quote_handler::paired_close(delim);
    let close = paired.unwrap_or(delim);
    let nestable = paired.is_some();
    let mut depth: i32 = if nestable { 1 } else { 0 };
    let mut body = String::new();
    while let Some(ch) = self.current_char() {
        match ch {
            '\\' => {
                // Copy the backslash and whatever follows it unchanged.
                body.push('\\');
                self.advance();
                if let Some(escaped) = self.current_char() {
                    body.push(escaped);
                    self.advance();
                }
            }
            _ if nestable && ch == delim => {
                // Nested opener inside a bracket-style body.
                body.push(ch);
                self.advance();
                depth += 1;
            }
            _ if ch == close => {
                if nestable {
                    depth -= 1;
                    if depth == 0 {
                        // Outermost closer: consume it and stop.
                        self.advance();
                        break;
                    }
                    body.push(ch);
                    self.advance();
                } else {
                    self.advance();
                    break;
                }
            }
            _ => {
                body.push(ch);
                self.advance();
            }
        }
    }
    body
}
fn parse_quote_operator(&mut self, delimiter: char) -> Option<Token> {
let info = self.current_quote_op.as_ref()?;
let start = info.start_pos;
let operator = info.operator.clone();
match operator.as_str() {
"s" => {
let _pattern = self.read_delimited_body(delimiter);
if quote_handler::paired_close(delimiter).is_some() {
while let Some(ch) = self.current_char() {
if ch.is_whitespace() {
self.advance();
} else {
break;
}
}
if self.current_char() == Some(delimiter) {
self.advance();
}
}
let _replacement = self.read_delimited_body(delimiter);
self.parse_regex_modifiers("e_handler::S_SPEC);
}
"tr" | "y" => {
let _from = self.read_delimited_body(delimiter);
if quote_handler::paired_close(delimiter).is_some() {
while let Some(ch) = self.current_char() {
if ch.is_whitespace() {
self.advance();
} else {
break;
}
}
if self.current_char() == Some(delimiter) {
self.advance();
}
}
let _to = self.read_delimited_body(delimiter);
self.parse_regex_modifiers("e_handler::TR_SPEC);
}
"qr" => {
let _pattern = self.read_delimited_body(delimiter);
self.parse_regex_modifiers("e_handler::QR_SPEC);
}
"m" => {
let _pattern = self.read_delimited_body(delimiter);
self.parse_regex_modifiers("e_handler::M_SPEC);
}
_ => {
let _body = self.read_delimited_body(delimiter);
}
}
let text = &self.input[start..self.position];
let token_type = quote_handler::get_quote_token_type(&operator);
self.mode = LexerMode::ExpectOperator;
self.current_quote_op = None;
Some(Token { token_type, text: Arc::from(text), start, end: self.position })
}
fn parse_regex_modifiers(&mut self, _spec: "e_handler::ModSpec) {
while let Some(ch) = self.current_char() {
if ch.is_ascii_alphanumeric() {
self.advance();
} else {
break;
}
}
}
/// Lexes a `/pattern/mods` regex match; the current char is the opening `/`
/// (mode context has already decided this is a regex, not division).
///
/// Two safety valves bound the scan: a per-call step budget
/// (`MAX_REGEX_PARSE_STEPS`) that converts a pathological pattern into an
/// `UnknownRest` token, and the shared `budget_guard` limits.
/// NOTE(review): returns `None` when input ends before the closing `/` —
/// presumably the caller treats that as no-token/EOF; confirm.
fn parse_regex(&mut self, start: usize) -> Option<Token> {
    self.advance();
    let mut regex_parse_steps: usize = 0;
    while let Some(ch) = self.current_char() {
        regex_parse_steps += 1;
        if regex_parse_steps > MAX_REGEX_PARSE_STEPS {
            // Debug builds log a short preview of the offending pattern.
            #[cfg(debug_assertions)]
            {
                let text = &self.input[start..self.position];
                let preview = truncate_preview(text, 50);
                tracing::debug!(
                    limit = MAX_REGEX_PARSE_STEPS,
                    pattern_preview = %preview,
                    "Regex parse step budget exceeded"
                );
            }
            // Give up: swallow the rest of the input as UnknownRest.
            self.position = self.input.len();
            return Some(Token {
                token_type: TokenType::UnknownRest,
                text: empty_arc(),
                start,
                end: self.position,
            });
        }
        if let Some(token) = self.budget_guard(start, 0) {
            return Some(token);
        }
        match ch {
            '/' => {
                // Closing delimiter; then consume trailing modifier letters.
                self.advance();
                while let Some(ch) = self.current_char() {
                    if ch.is_ascii_alphanumeric() {
                        self.advance();
                    } else {
                        break;
                    }
                }
                let text = &self.input[start..self.position];
                self.mode = LexerMode::ExpectOperator;
                return Some(Token {
                    token_type: TokenType::RegexMatch,
                    text: Arc::from(text),
                    start,
                    end: self.position,
                });
            }
            '\\' => {
                // An escaped char is never treated as the delimiter.
                self.advance();
                if self.current_char().is_some() {
                    self.advance();
                }
            }
            _ => self.advance(),
        }
    }
    None
}
}
// Process-wide cached empty string, so error/fallback tokens can share one
// allocation instead of allocating a fresh Arc per token.
static EMPTY_ARC: OnceLock<Arc<str>> = OnceLock::new();
/// Returns a cheap refcount-bump clone of the shared empty `Arc<str>`.
#[inline(always)]
fn empty_arc() -> Arc<str> {
    Arc::clone(EMPTY_ARC.get_or_init(|| Arc::from("")))
}
/// Returns `text` unchanged when it has at most `max_chars` characters,
/// otherwise the first `max_chars` characters followed by `"..."`.
fn truncate_preview(text: &str, max_chars: usize) -> String {
    // `char_indices` yields byte offsets at char boundaries, so the slice
    // below can never split a multi-byte character.
    text.char_indices()
        .nth(max_chars)
        .map_or_else(|| text.to_string(), |(idx, _)| format!("{}...", &text[..idx]))
}
/// True when `word` is a Perl keyword recognized by the lexer.
#[inline(always)]
fn is_keyword(word: &str) -> bool {
    // Length pre-filter: every lexer keyword is 1..=9 bytes, so empty or
    // longer words skip the table lookup entirely.
    (1..=9).contains(&word.len()) && is_lexer_keyword(word)
}
/// True when `word` is a builtin that takes a bare term argument, which puts
/// the lexer in term position (so a following `/` starts a regex rather than
/// division — see `test_builtin_regex_disambiguation`).
#[inline]
fn is_builtin_function(word: &str) -> bool {
    // The table below is kept sorted, so a binary search suffices.
    BARE_TERM_BUILTINS.binary_search_by(|probe| probe.cmp(&word)).is_ok()
}
/// Perl builtins whose next token is a term. Must stay sorted for the
/// binary search in `is_builtin_function`.
const BARE_TERM_BUILTINS: &[&str] = &[
    "abs", "chomp", "chop", "chr", "close", "defined", "delete", "each", "exists", "hex",
    "int", "join", "keys", "lc", "lcfirst", "length", "oct", "open", "ord", "pack", "print",
    "push", "read", "ref", "reverse", "rindex", "say", "scalar", "splice", "sprintf", "sqrt",
    "substr", "tie", "uc", "ucfirst", "unpack", "unshift", "untie", "values", "write",
];
// Every multi-char operator's second byte is one of these; used as a cheap
// reject before the full pair table.
const COMPOUND_SECOND_CHARS: &[u8] = b"=<>&|+->.~*:";
/// True when `first` followed by `second` forms (the prefix of) a compound
/// operator such as `==`, `=>`, `->`, `<<`, `::`, `=~`.
#[inline]
fn is_compound_operator(first: char, second: char) -> bool {
    if first.is_ascii() && second.is_ascii() {
        let (f, s) = (first as u8, second as u8);
        if !COMPOUND_SECOND_CHARS.contains(&s) {
            return false;
        }
        matches!(
            (f, s),
            // Compound assignments and comparisons ending in '='.
            (b'+' | b'-' | b'*' | b'/' | b'%' | b'&' | b'|' | b'^' | b'.', b'=')
                | (b'<' | b'>' | b'=' | b'!', b'=')
                // Bind operators.
                | (b'=' | b'!', b'~')
                // Doubled-char operators.
                | (b'+', b'+')
                | (b'-', b'-')
                | (b'&', b'&')
                | (b'|', b'|')
                | (b'<', b'<')
                | (b'>', b'>')
                | (b'*', b'*')
                | (b'/', b'/')
                | (b'.', b'.')
                | (b'~', b'~')
                | (b':', b':')
                // Arrows.
                | (b'-' | b'=', b'>')
        )
    } else {
        // Non-ASCII fallback mirrors the table above. All listed chars are
        // ASCII, so this arm can never match when either input is non-ASCII;
        // kept for parity with the byte table.
        matches!(
            (first, second),
            ('+' | '-' | '*' | '/' | '%' | '&' | '|' | '^' | '.' | '<' | '>' | '=' | '!', '=')
                | ('=' | '!' | '~', '~')
                | ('+', '+')
                | ('-', '-' | '>')
                | ('&', '&')
                | ('|', '|')
                | ('<', '<')
                | ('>' | '=', '>')
                | ('*', '*')
                | ('/', '/')
                | ('.', '.')
                | (':', ':')
        )
    }
}
impl Checkpointable for PerlLexer<'_> {
    /// Snapshots all lexer state needed to resume tokenizing from `position`.
    fn checkpoint(&self) -> LexerCheckpoint {
        use checkpoint::CheckpointContext;
        // Record enough context to re-enter the special scanning modes.
        let context = if matches!(self.mode, LexerMode::InFormatBody) {
            // Conservative back-offset so a restore can rescan the format
            // header region.
            CheckpointContext::Format { start_position: self.position.saturating_sub(100) }
        } else if let Some(&delim) = self.delimiter_stack.last() {
            CheckpointContext::QuoteLike {
                operator: String::new(),
                delimiter: delim,
                is_paired: true,
            }
        } else {
            CheckpointContext::Normal
        };
        LexerCheckpoint {
            position: self.position,
            mode: self.mode,
            delimiter_stack: self.delimiter_stack.clone(),
            in_prototype: self.in_prototype,
            prototype_depth: self.prototype_depth,
            after_sub: self.after_sub,
            after_arrow: self.after_arrow,
            hash_brace_depth: self.hash_brace_depth,
            after_var_subscript: self.after_var_subscript,
            paren_depth: self.paren_depth,
            current_pos: self.current_pos,
            context,
        }
    }
    /// Restores lexer state from `checkpoint`; the inverse of `checkpoint`.
    fn restore(&mut self, checkpoint: &LexerCheckpoint) {
        use checkpoint::CheckpointContext;
        self.position = checkpoint.position;
        self.mode = checkpoint.mode;
        // clone_from reuses the existing Vec allocation when possible.
        self.delimiter_stack.clone_from(&checkpoint.delimiter_stack);
        self.in_prototype = checkpoint.in_prototype;
        self.prototype_depth = checkpoint.prototype_depth;
        self.after_sub = checkpoint.after_sub;
        self.after_arrow = checkpoint.after_arrow;
        self.hash_brace_depth = checkpoint.hash_brace_depth;
        self.after_var_subscript = checkpoint.after_var_subscript;
        self.paren_depth = checkpoint.paren_depth;
        self.current_pos = checkpoint.current_pos;
        // A Format checkpoint must force the lexer back into format-body mode.
        if matches!(checkpoint.context, CheckpointContext::Format { .. })
            && !matches!(self.mode, LexerMode::InFormatBody)
        {
            self.mode = LexerMode::InFormatBody;
        }
    }
    /// A checkpoint is restorable as long as its position lies within input.
    fn can_restore(&self, checkpoint: &LexerCheckpoint) -> bool {
        checkpoint.position <= self.input.len()
    }
}
#[cfg(test)]
mod test_format_debug;
#[cfg(test)]
mod tests {
    use super::*;
    // Shorthand so tests can use `?` on `ok_or(...)` conversions.
    type TestResult = std::result::Result<(), Box<dyn std::error::Error>>;
    /// Smoke test: `my $x = 42;` yields keyword, identifier, operator,
    /// number, semicolon in order.
    #[test]
    fn test_basic_tokens() -> TestResult {
        let mut lexer = PerlLexer::new("my $x = 42;");
        let token = lexer.next_token().ok_or("Expected keyword token")?;
        assert_eq!(token.token_type, TokenType::Keyword(Arc::from("my")));
        let token = lexer.next_token().ok_or("Expected identifier token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(_)));
        let token = lexer.next_token().ok_or("Expected operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(_)));
        let token = lexer.next_token().ok_or("Expected number token")?;
        assert!(matches!(token.token_type, TokenType::Number(_)));
        let token = lexer.next_token().ok_or("Expected semicolon token")?;
        assert_eq!(token.token_type, TokenType::Semicolon);
        Ok(())
    }
    /// `/` is division in operator position but opens a regex in term
    /// position (e.g. right after `if (`).
    #[test]
    fn test_slash_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("10 / 2");
        lexer.next_token(); let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);
        let mut lexer = PerlLexer::new("if (/pattern/)");
        lexer.next_token(); lexer.next_token(); let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }
    /// `%` is a hash sigil in term position but modulo after a value.
    #[test]
    fn test_percent_and_double_sigil_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("%hash");
        let token = lexer.next_token().ok_or("Expected hash identifier token")?;
        assert!(
            matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "%hash")
        );
        let mut lexer = PerlLexer::new("10 % 3");
        lexer.next_token(); let token = lexer.next_token().ok_or("Expected modulo operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "%"));
        Ok(())
    }
    /// `//` is defined-or after a value, an empty regex after `=~`, and
    /// `**` lexes as a single exponent operator.
    #[test]
    fn test_defined_or_and_exponent() -> TestResult {
        let mut lexer = PerlLexer::new("$a // $b");
        lexer.next_token(); let token = lexer.next_token().ok_or("Expected defined-or operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "//"));
        let mut lexer = PerlLexer::new("$x =~ //");
        lexer.next_token(); lexer.next_token(); let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        let mut lexer = PerlLexer::new("2 ** 3");
        lexer.next_token(); let token = lexer.next_token().ok_or("Expected exponent operator token")?;
        assert!(matches!(token.token_type, TokenType::Operator(ref op) if op.as_ref() == "**"));
        Ok(())
    }
    /// `join` puts the lexer in term position, so a following `/` opens a
    /// regex rather than division.
    #[test]
    fn test_join_regex_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("join /,/, @parts");
        let token = lexer.next_token().ok_or("Expected join token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "join"));
        let token = lexer.next_token().ok_or("Expected regex token")?;
        assert_eq!(token.token_type, TokenType::RegexMatch);
        Ok(())
    }
    /// Every bare-term builtin should be followed by a regex, not division.
    #[test]
    fn test_builtin_regex_disambiguation() -> TestResult {
        for code in ["print /pattern/", "defined /pattern/", "keys /pattern/"] {
            let mut lexer = PerlLexer::new(code);
            lexer.next_token();
            let token = lexer.next_token().ok_or("Expected regex token")?;
            assert_eq!(token.token_type, TokenType::RegexMatch, "{code}");
        }
        Ok(())
    }
    /// `time` is not a bare-term builtin, so `/` after it is division.
    #[test]
    fn test_nullary_builtin_division_disambiguation() -> TestResult {
        let mut lexer = PerlLexer::new("time / 2");
        let token = lexer.next_token().ok_or("Expected time token")?;
        assert!(matches!(token.token_type, TokenType::Identifier(ref id) if id.as_ref() == "time"));
        let token = lexer.next_token().ok_or("Expected division token")?;
        assert_eq!(token.token_type, TokenType::Division);
        Ok(())
    }
    /// `peek_token` must not leak state changes (here: paren_depth).
    #[test]
    fn test_peek_token_does_not_mutate_paren_depth() -> TestResult {
        let mut lexer = PerlLexer::new("(1<<2)");
        assert_eq!(lexer.paren_depth, 0, "paren_depth must start at 0");
        let peeked = lexer.peek_token().ok_or("peek at ( failed")?;
        assert_eq!(peeked.token_type, TokenType::LeftParen);
        assert_eq!(lexer.paren_depth, 0, "peek_token must not mutate paren_depth");
        lexer.next_token();
        assert_eq!(lexer.paren_depth, 1);
        let peeked2 = lexer.peek_token().ok_or("peek at 1 failed")?;
        assert!(matches!(peeked2.token_type, TokenType::Number(_)));
        assert_eq!(lexer.paren_depth, 1, "peek at number must not change paren_depth");
        Ok(())
    }
}