// perl_parser_core/engine/parser/mod.rs

//! Recursive descent Perl parser.
//!
//! Consumes tokens from `perl-lexer` and produces AST nodes with error recovery.
//! The parser handles operator precedence, quote-like operators, and heredocs,
//! while tracking recursion depth to prevent stack overflows on malformed input.
//!
//! # IDE-Friendly Error Recovery
//!
//! This parser uses an **IDE-friendly error recovery model**:
//!
//! - **Returns `Ok(ast)` with ERROR nodes** for most parse failures (recovered errors)
//! - **Returns `Err`** only for catastrophic failures (recursion limits, etc.) or when
//!   parsing is cancelled through the cancellation flag
//!
//! This means `result.is_err()` is **not** the correct way to check for parse errors.
//! Instead, check for ERROR nodes in the AST or use `parser.errors()`:
//!
//! ```rust,ignore
//! let mut parser = Parser::new(code);
//! match parser.parse() {
//!     Err(_) => println!("Catastrophic parse failure"),
//!     Ok(ast) => {
//!         // Check for recovered errors via ERROR nodes
//!         if ast.to_sexp().contains("ERROR") {
//!             println!("Parse errors recovered: {:?}", parser.errors());
//!         }
//!     }
//! }
//! ```
//!
//! ## Why IDE-Friendly?
//!
//! Traditional compilers return `Err` on any syntax error. This prevents:
//! - Code completion in incomplete code
//! - Go-to-definition while typing
//! - Hover information in files with errors
//!
//! By returning partial ASTs with ERROR nodes, editors can provide useful
//! features even when code is incomplete or contains errors.
//!
//! # Performance
//!
//! - **Time complexity**: O(n) for typical token streams
//! - **Space complexity**: O(n) for AST storage with bounded recursion memory usage
//! - **Optimizations**: Fast-path parsing and efficient recovery to maintain performance
//! - **Benchmarks**: ~150µs–1ms for typical files; low ms for large file inputs
//! - **Large-scale notes**: Tuned to scale for large workspaces (50GB PST-style scans)
//!
//! # Usage
//!
//! ```rust
//! use perl_parser_core::Parser;
//!
//! let mut parser = Parser::new("my $var = 42; sub hello { print $var; }");
//! let ast = parser.parse();
//! ```

57use crate::{
58    ast::{Node, NodeKind, SourceLocation},
59    error::{ParseError, ParseOutput, ParseResult, RecoveryKind, RecoverySite},
60    heredoc_collector::{self, HeredocContent, PendingHeredoc, collect_all},
61    quote_parser,
62    token_stream::{Token, TokenKind, TokenStream},
63};
64use std::collections::VecDeque;
65use std::sync::Arc;
66use std::sync::atomic::{AtomicBool, Ordering};
67use std::time::Instant;
68
/// Strip `#`-to-end-of-line segments from `qw()` content.
///
/// For every line of `content`, everything from the first `#` onward is
/// dropped. The remaining line prefixes are re-joined with `\n`, so a later
/// `split_whitespace()` only sees the text that preceded each `#`.
///
/// Line terminators are normalised by [`str::lines`]: a trailing newline is
/// not reproduced and `\r\n` becomes `\n`. Empty input yields an empty string.
///
/// NOTE(review): perlop appears to describe `#` inside `qw()` as a literal
/// character that merely triggers a warning under `use warnings`, not as a
/// comment introducer. This helper deliberately drops it anyway — confirm
/// that this leniency is the intended behaviour.
fn strip_qw_comments(content: &str) -> String {
    // The result can never be longer than the input, so one allocation suffices.
    let mut stripped = String::with_capacity(content.len());
    for (index, line) in content.lines().enumerate() {
        if index > 0 {
            stripped.push('\n');
        }
        // Keep only the text before the first `#`; lines without one pass through whole.
        let kept = line.split_once('#').map_or(line, |(before, _)| before);
        stripped.push_str(kept);
    }
    stripped
}

83/// Parser state for a single Perl source input.
84///
85/// Construct with [`Parser::new`] and call [`Parser::parse`] to obtain an AST.
86/// Non-fatal syntax errors are collected and can be accessed via [`Parser::errors`].
87pub struct Parser<'a> {
88    /// Token stream providing access to lexed Perl script content
89    tokens: TokenStream<'a>,
90    /// Current recursion depth for overflow protection during complex Perl script parsing
91    recursion_depth: usize,
92    /// Position tracking for error reporting and AST location information
93    last_end_position: usize,
94    /// Context flag for disambiguating for-loop initialization syntax
95    in_for_loop_init: bool,
96    /// Statement boundary tracking for indirect object syntax detection
97    at_stmt_start: bool,
98    /// FIFO queue of pending heredoc declarations awaiting content collection
99    pending_heredocs: VecDeque<PendingHeredoc>,
100    /// Source bytes for heredoc content collection (shared with token stream)
101    src_bytes: &'a [u8],
102    /// Byte cursor tracking position for heredoc content collection
103    byte_cursor: usize,
104    /// Start time of parsing for timeout enforcement (specifically heredocs)
105    heredoc_start_time: Option<Instant>,
106    /// Collection of parse errors encountered during parsing (for error recovery)
107    errors: Vec<ParseError>,
108    /// Optional cancellation flag for cooperative cancellation from the LSP server.
109    cancellation_flag: Option<Arc<AtomicBool>>,
110    /// Counter to amortize cancellation checks (only check every 64 statements)
111    cancellation_check_counter: usize,
112}
113
// Recursion limit is set conservatively to prevent stack overflow
// before the limit triggers. The actual stack usage depends on the
// number of function frames between recursion checks (about 20-30
// for the precedence parsing chain). 128 * 30 = 3840 frames, which
// is safe. Real Perl code rarely exceeds 20-30 nesting levels.
const MAX_RECURSION_DEPTH: usize = 128;

121impl<'a> Parser<'a> {
122    /// Create a new parser for the provided Perl source.
123    ///
124    /// # Arguments
125    ///
126    /// * `input` - Perl source code to be parsed
127    ///
128    /// # Returns
129    ///
130    /// A configured parser ready to parse the provided source.
131    ///
132    /// # Examples
133    ///
134    /// ```rust
135    /// use perl_parser_core::Parser;
136    ///
137    /// let script = "use strict; my $filter = qr/important/;";
138    /// let mut parser = Parser::new(script);
139    /// // Parser ready to parse the source
140    /// ```
141    pub fn new(input: &'a str) -> Self {
142        Parser {
143            tokens: TokenStream::new(input),
144            recursion_depth: 0,
145            last_end_position: 0,
146            in_for_loop_init: false,
147            at_stmt_start: true,
148            pending_heredocs: VecDeque::new(),
149            src_bytes: input.as_bytes(),
150            byte_cursor: 0,
151            heredoc_start_time: None,
152            errors: Vec::new(),
153            cancellation_flag: None,
154            cancellation_check_counter: 0,
155        }
156    }
157
158    /// Create a new parser with a cancellation flag for cooperative cancellation.
159    ///
160    /// When the flag is set to `true`, the parser will return `Err(ParseError::Cancelled)`
161    /// at the next cancellation check point (every 64 statements).
162    pub fn new_with_cancellation(input: &'a str, cancellation_flag: Arc<AtomicBool>) -> Self {
163        let mut p = Parser::new(input);
164        p.cancellation_flag = Some(cancellation_flag);
165        p
166    }
167
168    /// Check for cooperative cancellation, amortised over every 64 calls.
169    ///
170    /// Returns `Err(ParseError::Cancelled)` if the cancellation flag has been set.
171    #[inline]
172    fn check_cancelled(&mut self) -> ParseResult<()> {
173        self.cancellation_check_counter = self.cancellation_check_counter.wrapping_add(1);
174        if self.cancellation_check_counter & 63 == 0 {
175            if let Some(ref flag) = self.cancellation_flag {
176                if flag.load(Ordering::Relaxed) {
177                    return Err(ParseError::Cancelled);
178                }
179            }
180        }
181        Ok(())
182    }
183
184    /// Create a new parser with custom enhanced recovery configuration.
185    ///
186    /// This constructor exists for API compatibility while enhanced recovery
187    /// configuration is being phased in.
188    ///
189    /// # Arguments
190    ///
191    /// * `input` - Perl source text to tokenize and parse.
192    /// * `_config` - Placeholder recovery configuration parameter.
193    ///
194    /// # Returns
195    ///
196    /// A parser instance initialized for the provided source text.
197    ///
198    /// # Examples
199    ///
200    /// ```rust
201    /// use perl_parser_core::Parser;
202    ///
203    /// let parser = Parser::new_with_recovery_config("my $x = 1;", ());
204    /// assert_eq!(parser.errors().len(), 0);
205    /// ```
206    pub fn new_with_recovery_config(input: &'a str, _config: ()) -> Self {
207        Parser::new(input)
208    }
209
210    /// Parse the source and return the AST for the Parse stage.
211    ///
212    /// # Returns
213    ///
214    /// * `Ok(Node)` - Parsed AST with a `Program` root node.
215    /// * `Err(ParseError)` - Non-recoverable parsing failure.
216    ///
217    /// # Errors
218    ///
219    /// Returns `ParseError` for non-recoverable conditions such as recursion limits.
220    ///
221    /// # Examples
222    ///
223    /// ```rust
224    /// use perl_parser_core::Parser;
225    ///
226    /// let mut parser = Parser::new("my $count = 1;");
227    /// let ast = parser.parse()?;
228    /// assert!(matches!(ast.kind, perl_parser_core::NodeKind::Program { .. }));
229    /// # Ok::<(), perl_parser_core::ParseError>(())
230    /// ```
231    pub fn parse(&mut self) -> ParseResult<Node> {
232        // Check cancellation before starting — handles pre-set flags immediately.
233        if let Some(ref flag) = self.cancellation_flag {
234            if flag.load(Ordering::Relaxed) {
235                return Err(ParseError::Cancelled);
236            }
237        }
238        self.parse_program()
239    }
240
241    /// Get all parse errors collected during parsing
242    ///
243    /// When error recovery is enabled, the parser continues after syntax errors
244    /// and collects them for later retrieval. This is useful for IDE integration
245    /// where you want to show all errors at once.
246    ///
247    /// # Returns
248    ///
249    /// A slice of all `ParseError`s encountered during parsing
250    ///
251    /// # Examples
252    ///
253    /// ```rust
254    /// use perl_parser_core::Parser;
255    ///
256    /// let mut parser = Parser::new("my $x = ; sub foo {");
257    /// let _ast = parser.parse(); // Parse with recovery
258    /// let errors = parser.errors();
259    /// // errors will contain details about syntax errors
260    /// ```
261    pub fn errors(&self) -> &[ParseError] {
262        &self.errors
263    }
264
265    /// Parse with error recovery and return comprehensive output.
266    ///
267    /// This method is preferred for LSP Analyze workflows and always returns
268    /// a `ParseOutput` containing the AST and any collected diagnostics.
269    ///
270    /// # Returns
271    ///
272    /// `ParseOutput` with the AST and diagnostics collected during parsing.
273    ///
274    /// # Examples
275    ///
276    /// ```rust
277    /// use perl_parser_core::Parser;
278    ///
279    /// let mut parser = Parser::new("my $x = ;");
280    /// let output = parser.parse_with_recovery();
281    /// assert!(!output.diagnostics.is_empty() || matches!(output.ast.kind, perl_parser_core::NodeKind::Program { .. }));
282    /// ```
283    pub fn parse_with_recovery(&mut self) -> ParseOutput {
284        let ast = match self.parse() {
285            Ok(node) => node,
286            Err(e) => {
287                // If parse() returned Err, it was a non-recoverable error (e.g. recursion limit)
288                // Ensure it's recorded if not already
289                if !self.errors.contains(&e) {
290                    self.errors.push(e.clone());
291                }
292
293                // Return a dummy Program node with the error
294                Node::new(
295                    NodeKind::Program { statements: vec![] },
296                    SourceLocation { start: 0, end: 0 },
297                )
298            }
299        };
300
301        ParseOutput::with_errors(ast, self.errors.clone())
302    }
303}
304
305include!("helpers.rs");
306include!("heredoc.rs");
307include!("statements.rs");
308include!("variables.rs");
309include!("control_flow.rs");
310include!("declarations.rs");
311include!("expressions/mod.rs");
312include!("expressions/precedence.rs");
313include!("expressions/unary.rs");
314include!("expressions/postfix.rs");
315include!("expressions/primary.rs");
316include!("expressions/calls.rs");
317include!("expressions/hashes.rs");
318include!("expressions/quotes.rs");
319
#[cfg(test)]
mod builtin_block_list_tests;
#[cfg(test)]
mod builtin_expansion_tests;
#[cfg(test)]
mod chained_deref_method_tests;
#[cfg(test)]
mod coderef_invocation_tests;
#[cfg(test)]
mod complex_args_tests;
#[cfg(test)]
mod control_flow_expr_tests;
#[cfg(test)]
mod declaration_in_args_tests;
#[cfg(test)]
mod error_recovery_tests;
#[cfg(test)]
mod eval_goto_tests;
#[cfg(test)]
mod for_builtin_block_tests;
#[cfg(test)]
mod format_comprehensive_tests;
#[cfg(test)]
mod format_tests;
#[cfg(test)]
mod forward_declaration_tests;
#[cfg(test)]
mod glob_assignment_tests;
#[cfg(test)]
mod glob_tests;
#[cfg(test)]
mod hash_vs_block_tests;
#[cfg(test)]
mod heredoc_security_tests;
#[cfg(test)]
mod indirect_call_tests;
#[cfg(test)]
mod indirect_object_tests;
#[cfg(test)]
mod loop_control_tests;
#[cfg(test)]
mod qualified_variable_subscript_tests;
#[cfg(test)]
mod regex_delimiter_tests;
#[cfg(test)]
mod slash_ambiguity_tests;
#[cfg(test)]
mod statement_modifier_tests;
#[cfg(test)]
mod tests;
#[cfg(test)]
mod tie_tests;
#[cfg(test)]
mod use_overload_tests;
#[cfg(test)]
mod x_repetition_tests;

/// Unit tests for the private `strip_qw_comments` helper.
#[cfg(test)]
mod strip_qw_comments_unit_tests {
    use super::strip_qw_comments;

    /// A trailing `# ...` segment is removed and the words on both lines survive.
    #[test]
    fn test_strip_basic() {
        let result = strip_qw_comments("foo # comment\n bar");
        assert_eq!(result.split_whitespace().collect::<Vec<_>>(), vec!["foo", "bar"]);
    }

    /// Content without any `#` keeps every word.
    #[test]
    fn test_strip_without_comments_keeps_words() {
        let result = strip_qw_comments("foo bar\nbaz");
        assert_eq!(result.split_whitespace().collect::<Vec<_>>(), vec!["foo", "bar", "baz"]);
    }

    /// Comment-only and empty inputs leave no words behind.
    #[test]
    fn test_strip_comment_only_and_empty_input() {
        assert!(strip_qw_comments("# nothing here").split_whitespace().next().is_none());
        assert!(strip_qw_comments("").is_empty());
    }
}