// perl_parser_core/engine/parser/mod.rs
1//! Recursive descent Perl parser.
2//!
3//! Consumes tokens from `perl-lexer` and produces AST nodes with error recovery.
4//! The parser handles operator precedence, quote-like operators, and heredocs,
5//! while tracking recursion depth to prevent stack overflows on malformed input.
6//!
7//! # IDE-Friendly Error Recovery
8//!
9//! This parser uses an **IDE-friendly error recovery model**:
10//!
11//! - **Returns `Ok(ast)` with ERROR nodes** for most parse failures (recovered errors)
12//! - **Returns `Err`** only for catastrophic failures (recursion limits, etc.)
13//!
14//! This means `result.is_err()` is **not** the correct way to check for parse errors.
15//! Instead, check for ERROR nodes in the AST or use `parser.errors()`:
16//!
17//! ```rust,ignore
18//! let mut parser = Parser::new(code);
19//! match parser.parse() {
20//! Err(_) => println!("Catastrophic parse failure"),
21//! Ok(ast) => {
22//! // Check for recovered errors via ERROR nodes
23//! if ast.to_sexp().contains("ERROR") {
24//! println!("Parse errors recovered: {:?}", parser.errors());
25//! }
26//! }
27//! }
28//! ```
29//!
30//! ## Why IDE-Friendly?
31//!
32//! Traditional compilers return `Err` on any syntax error. This prevents:
33//! - Code completion in incomplete code
34//! - Go-to-definition while typing
35//! - Hover information in files with errors
36//!
37//! By returning partial ASTs with ERROR nodes, editors can provide useful
38//! features even when code is incomplete or contains errors.
39//!
40//! # Performance
41//!
42//! - **Time complexity**: O(n) for typical token streams
43//! - **Space complexity**: O(n) for AST storage with bounded recursion memory usage
44//! - **Optimizations**: Fast-path parsing and efficient recovery to maintain performance
45//! - **Benchmarks**: ~150µs–1ms for typical files; low ms for large file inputs
46//! - **Large-scale notes**: Tuned to scale for large workspaces (50GB PST-style scans)
47//!
48//! # Usage
49//!
50//! ```rust
51//! use perl_parser_core::Parser;
52//!
53//! let mut parser = Parser::new("my $var = 42; sub hello { print $var; }");
54//! let ast = parser.parse();
55//! ```
56
57use crate::{
58 ast::{Node, NodeKind, SourceLocation},
59 error::{ParseError, ParseOutput, ParseResult, RecoveryKind, RecoverySite},
60 heredoc_collector::{self, HeredocContent, PendingHeredoc, collect_all},
61 quote_parser,
62 token_stream::{Token, TokenKind, TokenStream},
63};
64use std::collections::VecDeque;
65use std::sync::Arc;
66use std::sync::atomic::{AtomicBool, Ordering};
67use std::time::Instant;
68
69/// Strip Perl-style line comments from `qw()` content.
70///
71/// In Perl, `#` inside `qw()` begins a comment that extends to the end of the
72/// line (see perlop: "A # character within the list is treated as a comment
73/// character"). This function removes those comment segments so that
74/// `split_whitespace()` sees only the actual list elements.
/// Strip Perl-style line comments from `qw()` content.
///
/// In Perl, `#` inside `qw()` begins a comment that extends to the end of the
/// line (see perlop: "A # character within the list is treated as a comment
/// character"). This function removes those comment segments so that
/// `split_whitespace()` sees only the actual list elements.
///
/// The output preserves line boundaries (joined with `\n`, no trailing
/// newline), matching `str::lines` semantics: a trailing `\n` in the input
/// does not produce a trailing empty line.
fn strip_qw_comments(content: &str) -> String {
    // Build the result in a single preallocated buffer rather than
    // collecting into an intermediate Vec<&str> and joining.
    let mut out = String::with_capacity(content.len());
    for (i, line) in content.lines().enumerate() {
        if i > 0 {
            out.push('\n');
        }
        // Everything from the first `#` to end of line is a comment.
        let code = match line.find('#') {
            Some(pos) => &line[..pos],
            None => line,
        };
        out.push_str(code);
    }
    out
}
82
/// Parser state for a single Perl source input.
///
/// Construct with [`Parser::new`] and call [`Parser::parse`] to obtain an AST.
/// Non-fatal syntax errors are collected and can be accessed via [`Parser::errors`].
///
/// The lifetime `'a` ties the parser to the borrowed source text: both the
/// token stream and `src_bytes` reference the caller's input without copying.
pub struct Parser<'a> {
    /// Token stream providing access to lexed Perl script content
    tokens: TokenStream<'a>,
    /// Current recursion depth for overflow protection during complex Perl script parsing;
    /// compared against `MAX_RECURSION_DEPTH`
    recursion_depth: usize,
    /// Position tracking for error reporting and AST location information
    last_end_position: usize,
    /// Context flag for disambiguating for-loop initialization syntax
    in_for_loop_init: bool,
    /// Statement boundary tracking for indirect object syntax detection
    at_stmt_start: bool,
    /// FIFO queue of pending heredoc declarations awaiting content collection
    pending_heredocs: VecDeque<PendingHeredoc>,
    /// Source bytes for heredoc content collection (same input the token stream lexes;
    /// heredoc collection operates on byte offsets rather than tokens)
    src_bytes: &'a [u8],
    /// Byte cursor tracking position for heredoc content collection
    byte_cursor: usize,
    /// Start time of parsing for timeout enforcement (specifically heredocs);
    /// `None` until heredoc collection begins
    heredoc_start_time: Option<Instant>,
    /// Collection of parse errors encountered during parsing (for error recovery);
    /// exposed via [`Parser::errors`]
    errors: Vec<ParseError>,
    /// Optional cancellation flag for cooperative cancellation from the LSP server.
    cancellation_flag: Option<Arc<AtomicBool>>,
    /// Counter to amortize cancellation checks (only check every 64 statements)
    cancellation_check_counter: usize,
}
113
// Recursion limit, set conservatively so the limit triggers before a real
// stack overflow can occur. Actual stack usage per nesting level depends on
// the number of function frames between recursion checks (roughly 20-30 for
// the precedence-parsing chain), so 128 levels corresponds to at most
// ~3840 stack frames, which is well within typical stack sizes. Real Perl
// code rarely exceeds 20-30 nesting levels.
const MAX_RECURSION_DEPTH: usize = 128;
120
121impl<'a> Parser<'a> {
122 /// Create a new parser for the provided Perl source.
123 ///
124 /// # Arguments
125 ///
126 /// * `input` - Perl source code to be parsed
127 ///
128 /// # Returns
129 ///
130 /// A configured parser ready to parse the provided source.
131 ///
132 /// # Examples
133 ///
134 /// ```rust
135 /// use perl_parser_core::Parser;
136 ///
137 /// let script = "use strict; my $filter = qr/important/;";
138 /// let mut parser = Parser::new(script);
139 /// // Parser ready to parse the source
140 /// ```
141 pub fn new(input: &'a str) -> Self {
142 Parser {
143 tokens: TokenStream::new(input),
144 recursion_depth: 0,
145 last_end_position: 0,
146 in_for_loop_init: false,
147 at_stmt_start: true,
148 pending_heredocs: VecDeque::new(),
149 src_bytes: input.as_bytes(),
150 byte_cursor: 0,
151 heredoc_start_time: None,
152 errors: Vec::new(),
153 cancellation_flag: None,
154 cancellation_check_counter: 0,
155 }
156 }
157
158 /// Create a new parser with a cancellation flag for cooperative cancellation.
159 ///
160 /// When the flag is set to `true`, the parser will return `Err(ParseError::Cancelled)`
161 /// at the next cancellation check point (every 64 statements).
162 pub fn new_with_cancellation(input: &'a str, cancellation_flag: Arc<AtomicBool>) -> Self {
163 let mut p = Parser::new(input);
164 p.cancellation_flag = Some(cancellation_flag);
165 p
166 }
167
168 /// Create a parser from pre-lexed tokens, skipping the lexer pass.
169 ///
170 /// This constructor is the integration point for the incremental parsing
171 /// pipeline: when cached tokens are available for an unchanged region of
172 /// source, they can be fed directly into the parser without re-lexing.
173 ///
174 /// # Arguments
175 ///
176 /// * `tokens` — Pre-lexed `Token` values produced by a prior [`TokenStream`]
177 /// pass. Trivia tokens (whitespace, comments) should already be filtered
178 /// out, as [`TokenStream::from_vec`] does not apply trivia skipping.
179 /// An `Eof` token does **not** need to be included; the stream synthesises
180 /// one when the buffer is exhausted.
181 /// * `source` — The original Perl source text. This is still required for
182 /// heredoc content collection which operates directly on byte offsets in
183 /// the source rather than on the token stream.
184 ///
185 /// # Returns
186 ///
187 /// A configured parser that will consume `tokens` in order without invoking
188 /// the lexer. The resulting AST is structurally identical to one produced by
189 /// [`Parser::new`] with the same source, provided the token list is complete
190 /// and accurate.
191 ///
192 /// # Context-sensitive token disambiguation
193 ///
194 /// The standard parser uses `relex_as_term` to re-lex ambiguous tokens (e.g.
195 /// `/` as division vs. regex) in context-sensitive positions. When using
196 /// pre-lexed tokens the kind is fixed from the original lex pass, so the
197 /// original parse context must have been correct. In practice this means
198 /// `from_tokens` is safe to use when the token stream comes from a previous
199 /// successful parse of the same source.
200 ///
201 /// # Examples
202 ///
203 /// ```rust,ignore
204 /// use perl_parser_core::{Parser, Token, TokenKind, TokenStream};
205 ///
206 /// let source = "my $x = 42;";
207 ///
208 /// // Collect pre-lexed tokens (normally cached from a prior parse)
209 /// let mut stream = TokenStream::new(source);
210 /// let mut tokens = Vec::new();
211 /// loop {
212 /// match stream.next() {
213 /// Ok(t) if t.kind == TokenKind::Eof => break,
214 /// Ok(t) => tokens.push(t),
215 /// Err(_) => break,
216 /// }
217 /// }
218 ///
219 /// let mut parser = Parser::from_tokens(tokens, source);
220 /// let ast = parser.parse()?;
221 /// assert!(matches!(ast.kind, perl_parser_core::NodeKind::Program { .. }));
222 /// # Ok::<(), perl_parser_core::ParseError>(())
223 /// ```
224 pub fn from_tokens(tokens: Vec<Token>, source: &'a str) -> Self {
225 Parser {
226 tokens: TokenStream::from_vec(tokens),
227 recursion_depth: 0,
228 last_end_position: 0,
229 in_for_loop_init: false,
230 at_stmt_start: true,
231 pending_heredocs: VecDeque::new(),
232 src_bytes: source.as_bytes(),
233 byte_cursor: 0,
234 heredoc_start_time: None,
235 errors: Vec::new(),
236 cancellation_flag: None,
237 cancellation_check_counter: 0,
238 }
239 }
240
241 /// Check for cooperative cancellation, amortised over every 64 calls.
242 ///
243 /// Returns `Err(ParseError::Cancelled)` if the cancellation flag has been set.
244 #[inline]
245 fn check_cancelled(&mut self) -> ParseResult<()> {
246 self.cancellation_check_counter = self.cancellation_check_counter.wrapping_add(1);
247 if self.cancellation_check_counter & 63 == 0 {
248 if let Some(ref flag) = self.cancellation_flag {
249 if flag.load(Ordering::Relaxed) {
250 return Err(ParseError::Cancelled);
251 }
252 }
253 }
254 Ok(())
255 }
256
257 /// Create a new parser with custom enhanced recovery configuration.
258 ///
259 /// This constructor exists for API compatibility while enhanced recovery
260 /// configuration is being phased in.
261 ///
262 /// # Arguments
263 ///
264 /// * `input` - Perl source text to tokenize and parse.
265 /// * `_config` - Placeholder recovery configuration parameter.
266 ///
267 /// # Returns
268 ///
269 /// A parser instance initialized for the provided source text.
270 ///
271 /// # Examples
272 ///
273 /// ```rust
274 /// use perl_parser_core::Parser;
275 ///
276 /// let parser = Parser::new_with_recovery_config("my $x = 1;", ());
277 /// assert_eq!(parser.errors().len(), 0);
278 /// ```
279 pub fn new_with_recovery_config(input: &'a str, _config: ()) -> Self {
280 Parser::new(input)
281 }
282
283 /// Parse the source and return the AST for the Parse stage.
284 ///
285 /// # Returns
286 ///
287 /// * `Ok(Node)` - Parsed AST with a `Program` root node.
288 /// * `Err(ParseError)` - Non-recoverable parsing failure.
289 ///
290 /// # Errors
291 ///
292 /// Returns `ParseError` for non-recoverable conditions such as recursion limits.
293 ///
294 /// # Examples
295 ///
296 /// ```rust
297 /// use perl_parser_core::Parser;
298 ///
299 /// let mut parser = Parser::new("my $count = 1;");
300 /// let ast = parser.parse()?;
301 /// assert!(matches!(ast.kind, perl_parser_core::NodeKind::Program { .. }));
302 /// # Ok::<(), perl_parser_core::ParseError>(())
303 /// ```
304 pub fn parse(&mut self) -> ParseResult<Node> {
305 // Check cancellation before starting — handles pre-set flags immediately.
306 if let Some(ref flag) = self.cancellation_flag {
307 if flag.load(Ordering::Relaxed) {
308 return Err(ParseError::Cancelled);
309 }
310 }
311 self.parse_program()
312 }
313
314 /// Get all parse errors collected during parsing
315 ///
316 /// When error recovery is enabled, the parser continues after syntax errors
317 /// and collects them for later retrieval. This is useful for IDE integration
318 /// where you want to show all errors at once.
319 ///
320 /// # Returns
321 ///
322 /// A slice of all `ParseError`s encountered during parsing
323 ///
324 /// # Examples
325 ///
326 /// ```rust
327 /// use perl_parser_core::Parser;
328 ///
329 /// let mut parser = Parser::new("my $x = ; sub foo {");
330 /// let _ast = parser.parse(); // Parse with recovery
331 /// let errors = parser.errors();
332 /// // errors will contain details about syntax errors
333 /// ```
334 pub fn errors(&self) -> &[ParseError] {
335 &self.errors
336 }
337
338 /// Parse with error recovery and return comprehensive output.
339 ///
340 /// This method is preferred for LSP Analyze workflows and always returns
341 /// a `ParseOutput` containing the AST and any collected diagnostics.
342 ///
343 /// # Returns
344 ///
345 /// `ParseOutput` with the AST and diagnostics collected during parsing.
346 ///
347 /// # Examples
348 ///
349 /// ```rust
350 /// use perl_parser_core::Parser;
351 ///
352 /// let mut parser = Parser::new("my $x = ;");
353 /// let output = parser.parse_with_recovery();
354 /// assert!(!output.diagnostics.is_empty() || matches!(output.ast.kind, perl_parser_core::NodeKind::Program { .. }));
355 /// ```
356 pub fn parse_with_recovery(&mut self) -> ParseOutput {
357 let ast = match self.parse() {
358 Ok(node) => node,
359 Err(e) => {
360 // If parse() returned Err, it was a non-recoverable error (e.g. recursion limit)
361 // Ensure it's recorded if not already
362 if !self.errors.contains(&e) {
363 self.errors.push(e.clone());
364 }
365
366 // Return a dummy Program node with the error
367 Node::new(
368 NodeKind::Program { statements: vec![] },
369 SourceLocation { start: 0, end: 0 },
370 )
371 }
372 };
373
374 ParseOutput::with_errors(ast, self.errors.clone())
375 }
376}
377
378include!("helpers.rs");
379include!("heredoc.rs");
380include!("statements.rs");
381include!("variables.rs");
382include!("control_flow.rs");
383include!("declarations.rs");
384include!("expressions/mod.rs");
385include!("expressions/precedence.rs");
386include!("expressions/unary.rs");
387include!("expressions/postfix.rs");
388include!("expressions/primary.rs");
389include!("expressions/calls.rs");
390include!("expressions/hashes.rs");
391include!("expressions/quotes.rs");
392
// ---------------------------------------------------------------------------
// Parser test suites, compiled only under `#[cfg(test)]`. Each module lives
// in a sibling file named after the language feature or regression it covers.
// ---------------------------------------------------------------------------
#[cfg(test)]
mod builtin_block_list_tests;
#[cfg(test)]
mod builtin_expansion_tests;
#[cfg(test)]
mod chained_deref_method_tests;
#[cfg(test)]
mod coderef_invocation_tests;
#[cfg(test)]
mod complex_args_tests;
#[cfg(test)]
mod control_flow_expr_tests;
#[cfg(test)]
mod declaration_in_args_tests;
#[cfg(test)]
mod error_recovery_tests;
#[cfg(test)]
mod eval_goto_tests;
#[cfg(test)]
mod for_builtin_block_tests;
#[cfg(test)]
mod format_comprehensive_tests;
#[cfg(test)]
mod format_tests;
#[cfg(test)]
mod forward_declaration_tests;
#[cfg(test)]
mod from_tokens_tests;
#[cfg(test)]
mod glob_assignment_tests;
#[cfg(test)]
mod glob_tests;
#[cfg(test)]
mod hash_vs_block_tests;
#[cfg(test)]
mod heredoc_security_tests;
#[cfg(test)]
mod indirect_call_tests;
#[cfg(test)]
mod indirect_object_tests;
#[cfg(test)]
mod loop_control_tests;
#[cfg(test)]
mod qualified_variable_subscript_tests;
#[cfg(test)]
mod regex_delimiter_tests;
#[cfg(test)]
mod slash_ambiguity_tests;
#[cfg(test)]
mod statement_modifier_tests;
#[cfg(test)]
mod tests;
#[cfg(test)]
mod tie_tests;
#[cfg(test)]
mod use_overload_tests;
#[cfg(test)]
mod x_repetition_tests;
451
#[cfg(test)]
mod strip_qw_comments_unit_tests {
    use super::strip_qw_comments;

    /// `#` starts a comment that extends to end of line; the surviving
    /// words are exactly the `qw()` list elements.
    #[test]
    fn test_strip_basic() {
        let result = strip_qw_comments("foo # comment\n bar");
        assert_eq!(result.split_whitespace().collect::<Vec<_>>(), vec!["foo", "bar"]);
    }

    /// Empty input yields an empty string, not a panic or a stray newline.
    #[test]
    fn test_empty_input() {
        assert_eq!(strip_qw_comments(""), "");
    }

    /// A line that is entirely a comment contributes no list elements.
    #[test]
    fn test_comment_only_line() {
        let result = strip_qw_comments("# all comment\nword");
        assert_eq!(result.split_whitespace().collect::<Vec<_>>(), vec!["word"]);
    }

    /// Input without any `#` passes through with line structure intact.
    #[test]
    fn test_no_comments_passthrough() {
        assert_eq!(strip_qw_comments("a b\nc"), "a b\nc");
    }

    /// Only the first `#` on a line matters; everything after it goes.
    #[test]
    fn test_multiple_hashes() {
        let result = strip_qw_comments("x # one # two\ny");
        assert_eq!(result.split_whitespace().collect::<Vec<_>>(), vec!["x", "y"]);
    }
}
461}