Skip to main content

perl_parser_core/engine/parser/
mod.rs

1//! Recursive descent Perl parser.
2//!
3//! Consumes tokens from `perl-lexer` and produces AST nodes with error recovery.
4//! The parser handles operator precedence, quote-like operators, and heredocs,
5//! while tracking recursion depth to prevent stack overflows on malformed input.
6//!
7//! # IDE-Friendly Error Recovery
8//!
9//! This parser uses an **IDE-friendly error recovery model**:
10//!
11//! - **Returns `Ok(ast)` with ERROR nodes** for most parse failures (recovered errors)
12//! - **Returns `Err`** only for catastrophic failures (recursion limits, etc.)
13//!
14//! This means `result.is_err()` is **not** the correct way to check for parse errors.
15//! Instead, check for ERROR nodes in the AST or use `parser.errors()`:
16//!
17//! ```rust,ignore
18//! let mut parser = Parser::new(code);
19//! match parser.parse() {
20//!     Err(_) => println!("Catastrophic parse failure"),
21//!     Ok(ast) => {
22//!         // Check for recovered errors via ERROR nodes
23//!         if ast.to_sexp().contains("ERROR") {
24//!             println!("Parse errors recovered: {:?}", parser.errors());
25//!         }
26//!     }
27//! }
28//! ```
29//!
30//! ## Why IDE-Friendly?
31//!
32//! Traditional compilers return `Err` on any syntax error. This prevents:
33//! - Code completion in incomplete code
34//! - Go-to-definition while typing
35//! - Hover information in files with errors
36//!
37//! By returning partial ASTs with ERROR nodes, editors can provide useful
38//! features even when code is incomplete or contains errors.
39//!
40//! # Performance
41//!
42//! - **Time complexity**: O(n) for typical token streams
43//! - **Space complexity**: O(n) for AST storage with bounded recursion memory usage
44//! - **Optimizations**: Fast-path parsing and efficient recovery to maintain performance
45//! - **Benchmarks**: ~150µs–1ms for typical files; low ms for large file inputs
46//! - **Large-scale notes**: Tuned to scale for large workspaces (50GB PST-style scans)
47//!
48//! # Usage
49//!
50//! ```rust
51//! use perl_parser_core::Parser;
52//!
53//! let mut parser = Parser::new("my $var = 42; sub hello { print $var; }");
54//! let ast = parser.parse();
55//! ```
56
57use crate::{
58    ast::{Node, NodeKind, SourceLocation},
59    error::{ParseError, ParseOutput, ParseResult, RecoveryKind, RecoverySite},
60    heredoc_collector::{self, HeredocContent, PendingHeredoc, collect_all},
61    quote_parser,
62    token_stream::{Token, TokenKind, TokenStream},
63};
64use std::collections::{HashSet, VecDeque};
65use std::sync::Arc;
66use std::sync::atomic::{AtomicBool, Ordering};
67use std::time::Instant;
68
/// Remove `#`-to-end-of-line segments from the body of a `qw()` list.
///
/// Every line of `content` is truncated at its first `#` (the `#` itself is
/// dropped too); lines without a `#` are kept unchanged. The surviving pieces
/// are re-joined with `\n`, so a later `split_whitespace()` only sees the
/// words that precede any comment marker. A trailing newline in `content` is
/// not reproduced in the result.
///
/// NOTE(review): perlop describes `#` inside `qw()` as something that merely
/// triggers a warning rather than starting a comment — confirm that stripping
/// is the intended (IDE-friendly) interpretation here.
fn strip_qw_comments(content: &str) -> String {
    let mut stripped = String::with_capacity(content.len());
    for (index, raw_line) in content.lines().enumerate() {
        // Separator goes *between* lines only, mirroring a `join("\n")`.
        if index > 0 {
            stripped.push('\n');
        }
        let kept = raw_line.split_once('#').map_or(raw_line, |(before_hash, _)| before_hash);
        stripped.push_str(kept);
    }
    stripped
}
82
83/// Parser state for a single Perl source input.
84///
85/// Construct with [`Parser::new`] and call [`Parser::parse`] to obtain an AST.
86/// Non-fatal syntax errors are collected and can be accessed via [`Parser::errors`].
87pub struct Parser<'a> {
88    /// Token stream providing access to lexed Perl script content
89    tokens: TokenStream<'a>,
90    /// Current recursion depth for overflow protection during complex Perl script parsing
91    recursion_depth: usize,
92    /// Position tracking for error reporting and AST location information
93    last_end_position: usize,
94    /// Context flag for disambiguating for-loop initialization syntax
95    in_for_loop_init: bool,
96    /// Depth of nested class bodies for context-sensitive class-body constructs
97    in_class_body: usize,
98    /// Statement boundary tracking for indirect object syntax detection
99    at_stmt_start: bool,
100    /// FIFO queue of pending heredoc declarations awaiting content collection
101    pending_heredocs: VecDeque<PendingHeredoc>,
102    /// Custom attributes registered by Attribute::Handlers declarations in this file.
103    custom_attribute_handlers: HashSet<String>,
104    /// Whether `use Attribute::Handlers;` has been seen in this file.
105    attribute_handlers_enabled: bool,
106    /// Source bytes for heredoc content collection (shared with token stream)
107    src_bytes: &'a [u8],
108    /// Byte cursor tracking position for heredoc content collection
109    byte_cursor: usize,
110    /// Start time of parsing for timeout enforcement (specifically heredocs)
111    heredoc_start_time: Option<Instant>,
112    /// Collection of parse errors encountered during parsing (for error recovery)
113    errors: Vec<ParseError>,
114    /// Optional cancellation flag for cooperative cancellation from the LSP server.
115    cancellation_flag: Option<Arc<AtomicBool>>,
116    /// Counter to amortize cancellation checks (only check every 64 statements)
117    cancellation_check_counter: usize,
118}
119
// Maximum nesting depth the parser recurses to before giving up.
//
// The limit is deliberately conservative so that it triggers well before the
// native stack can overflow. Actual stack usage depends on the number of
// function frames between two recursion checks (about 20-30 for the
// precedence parsing chain), so the worst case is roughly
// 128 * 30 = ~3840 frames, which is safe. Real Perl code rarely exceeds
// 20-30 nesting levels.
const MAX_RECURSION_DEPTH: usize = 128;
126
127impl<'a> Parser<'a> {
128    /// Create a new parser for the provided Perl source.
129    ///
130    /// # Arguments
131    ///
132    /// * `input` - Perl source code to be parsed
133    ///
134    /// # Returns
135    ///
136    /// A configured parser ready to parse the provided source.
137    ///
138    /// # Examples
139    ///
140    /// ```rust
141    /// use perl_parser_core::Parser;
142    ///
143    /// let script = "use strict; my $filter = qr/important/;";
144    /// let mut parser = Parser::new(script);
145    /// // Parser ready to parse the source
146    /// ```
147    pub fn new(input: &'a str) -> Self {
148        Parser {
149            tokens: TokenStream::new(input),
150            recursion_depth: 0,
151            last_end_position: 0,
152            in_for_loop_init: false,
153            in_class_body: 0,
154            at_stmt_start: true,
155            pending_heredocs: VecDeque::new(),
156            custom_attribute_handlers: HashSet::new(),
157            attribute_handlers_enabled: false,
158            src_bytes: input.as_bytes(),
159            byte_cursor: 0,
160            heredoc_start_time: None,
161            errors: Vec::new(),
162            cancellation_flag: None,
163            cancellation_check_counter: 0,
164        }
165    }
166
167    /// Create a new parser with a cancellation flag for cooperative cancellation.
168    ///
169    /// When the flag is set to `true`, the parser will return `Err(ParseError::Cancelled)`
170    /// at the next cancellation check point (every 64 statements).
171    ///
172    /// # Arguments
173    ///
174    /// * `input` - Perl source code to parse.
175    /// * `cancellation_flag` - Shared flag used to request cancellation.
176    ///
177    /// # Returns
178    ///
179    /// A parser configured with cooperative cancellation checks.
180    ///
181    /// # Examples
182    ///
183    /// ```rust
184    /// use perl_parser_core::Parser;
185    /// use std::sync::{
186    ///     atomic::AtomicBool,
187    ///     Arc,
188    /// };
189    ///
190    /// let cancellation_flag = Arc::new(AtomicBool::new(false));
191    /// let mut parser = Parser::new_with_cancellation("my $x = 1;", cancellation_flag);
192    /// let _ = parser.parse();
193    /// ```
194    ///
195    /// # Arguments
196    ///
197    /// `input` and `cancellation_flag` configure source + cancellation.
198    ///
199    /// # Returns
200    ///
201    /// A parser configured with cooperative cancellation checks.
202    ///
203    /// # Examples
204    ///
205    /// See the cancellation usage example above.
206    pub fn new_with_cancellation(input: &'a str, cancellation_flag: Arc<AtomicBool>) -> Self {
207        let mut p = Parser::new(input);
208        p.cancellation_flag = Some(cancellation_flag);
209        p
210    }
211
212    /// Create a parser from pre-lexed tokens, skipping the lexer pass.
213    ///
214    /// This constructor is the integration point for the incremental parsing
215    /// pipeline: when cached tokens are available for an unchanged region of
216    /// source, they can be fed directly into the parser without re-lexing.
217    ///
218    /// # Arguments
219    ///
220    /// * `tokens` — Pre-lexed `Token` values produced by a prior [`TokenStream`]
221    ///   pass. Trivia tokens (whitespace, comments) should already be filtered
222    ///   out, as [`TokenStream::from_vec`] does not apply trivia skipping.
223    ///   An `Eof` token does **not** need to be included; the stream synthesises
224    ///   one when the buffer is exhausted.
225    /// * `source` — The original Perl source text. This is still required for
226    ///   heredoc content collection which operates directly on byte offsets in
227    ///   the source rather than on the token stream.
228    ///
229    /// # Returns
230    ///
231    /// A configured parser that will consume `tokens` in order without invoking
232    /// the lexer. The resulting AST is structurally identical to one produced by
233    /// [`Parser::new`] with the same source, provided the token list is complete
234    /// and accurate.
235    ///
236    /// # Context-sensitive token disambiguation
237    ///
238    /// The standard parser uses `relex_as_term` to re-lex ambiguous tokens (e.g.
239    /// `/` as division vs. regex) in context-sensitive positions. When using
240    /// pre-lexed tokens the kind is fixed from the original lex pass, so the
241    /// original parse context must have been correct. In practice this means
242    /// `from_tokens` is safe to use when the token stream comes from a previous
243    /// successful parse of the same source.
244    ///
245    /// # Examples
246    ///
247    /// ```rust,ignore
248    /// use perl_parser_core::{Parser, Token, TokenKind, TokenStream};
249    ///
250    /// let source = "my $x = 42;";
251    ///
252    /// // Collect pre-lexed tokens (normally cached from a prior parse)
253    /// let mut stream = TokenStream::new(source);
254    /// let mut tokens = Vec::new();
255    /// loop {
256    ///     match stream.next() {
257    ///         Ok(t) if t.kind == TokenKind::Eof => break,
258    ///         Ok(t) => tokens.push(t),
259    ///         Err(_) => break,
260    ///     }
261    /// }
262    ///
263    /// let mut parser = Parser::from_tokens(tokens, source);
264    /// let ast = parser.parse()?;
265    /// assert!(matches!(ast.kind, perl_parser_core::NodeKind::Program { .. }));
266    /// # Ok::<(), perl_parser_core::ParseError>(())
267    /// ```
268    ///
269    /// # Arguments
270    ///
271    /// * `tokens` - Pre-lexed non-trivia tokens.
272    /// * `source` - Original source text used by heredoc processing.
273    ///
274    /// # Returns
275    ///
276    /// A parser that consumes the provided token vector.
277    ///
278    /// # Examples
279    ///
280    /// See the pre-lexed token example above.
281    pub fn from_tokens(tokens: Vec<Token>, source: &'a str) -> Self {
282        Parser {
283            tokens: TokenStream::from_vec(tokens),
284            recursion_depth: 0,
285            last_end_position: 0,
286            in_for_loop_init: false,
287            in_class_body: 0,
288            at_stmt_start: true,
289            pending_heredocs: VecDeque::new(),
290            custom_attribute_handlers: HashSet::new(),
291            attribute_handlers_enabled: false,
292            src_bytes: source.as_bytes(),
293            byte_cursor: 0,
294            heredoc_start_time: None,
295            errors: Vec::new(),
296            cancellation_flag: None,
297            cancellation_check_counter: 0,
298        }
299    }
300
301    /// Check for cooperative cancellation, amortised over every 64 calls.
302    ///
303    /// Returns `Err(ParseError::Cancelled)` if the cancellation flag has been set.
304    #[inline]
305    fn check_cancelled(&mut self) -> ParseResult<()> {
306        self.cancellation_check_counter = self.cancellation_check_counter.wrapping_add(1);
307        if self.cancellation_check_counter & 63 == 0 {
308            if let Some(ref flag) = self.cancellation_flag {
309                if flag.load(Ordering::Relaxed) {
310                    return Err(ParseError::Cancelled);
311                }
312            }
313        }
314        Ok(())
315    }
316
317    /// Create a new parser with custom enhanced recovery configuration.
318    ///
319    /// This constructor exists for API compatibility while enhanced recovery
320    /// configuration is being phased in.
321    ///
322    /// # Arguments
323    ///
324    /// * `input` - Perl source text to tokenize and parse.
325    /// * `_config` - Placeholder recovery configuration parameter.
326    ///
327    /// # Returns
328    ///
329    /// A parser instance initialized for the provided source text.
330    ///
331    /// # Examples
332    ///
333    /// ```rust
334    /// use perl_parser_core::Parser;
335    ///
336    /// let parser = Parser::new_with_recovery_config("my $x = 1;", ());
337    /// assert_eq!(parser.errors().len(), 0);
338    /// ```
339    pub fn new_with_recovery_config(input: &'a str, _config: ()) -> Self {
340        Parser::new(input)
341    }
342
343    /// Parse the source and return the AST for the Parse stage.
344    ///
345    /// # Returns
346    ///
347    /// * `Ok(Node)` - Parsed AST with a `Program` root node.
348    /// * `Err(ParseError)` - Non-recoverable parsing failure.
349    ///
350    /// # Errors
351    ///
352    /// Returns `ParseError` for non-recoverable conditions such as recursion limits.
353    ///
354    /// # Examples
355    ///
356    /// ```rust
357    /// use perl_parser_core::Parser;
358    ///
359    /// let mut parser = Parser::new("my $count = 1;");
360    /// let ast = parser.parse()?;
361    /// assert!(matches!(ast.kind, perl_parser_core::NodeKind::Program { .. }));
362    /// # Ok::<(), perl_parser_core::ParseError>(())
363    /// ```
364    pub fn parse(&mut self) -> ParseResult<Node> {
365        // Check cancellation before starting — handles pre-set flags immediately.
366        if let Some(ref flag) = self.cancellation_flag {
367            if flag.load(Ordering::Relaxed) {
368                return Err(ParseError::Cancelled);
369            }
370        }
371        self.parse_program()
372    }
373
374    /// Get all parse errors collected during parsing
375    ///
376    /// When error recovery is enabled, the parser continues after syntax errors
377    /// and collects them for later retrieval. This is useful for IDE integration
378    /// where you want to show all errors at once.
379    ///
380    /// # Returns
381    ///
382    /// A slice of all `ParseError`s encountered during parsing
383    ///
384    /// # Examples
385    ///
386    /// ```rust
387    /// use perl_parser_core::Parser;
388    ///
389    /// let mut parser = Parser::new("my $x = ; sub foo {");
390    /// let _ast = parser.parse(); // Parse with recovery
391    /// let errors = parser.errors();
392    /// // errors will contain details about syntax errors
393    /// ```
394    pub fn errors(&self) -> &[ParseError] {
395        &self.errors
396    }
397
398    /// Parse with error recovery and return comprehensive output.
399    ///
400    /// This method is preferred for LSP Analyze workflows and always returns
401    /// a `ParseOutput` containing the AST and any collected diagnostics.
402    ///
403    /// # Returns
404    ///
405    /// `ParseOutput` with the AST and diagnostics collected during parsing.
406    ///
407    /// # Examples
408    ///
409    /// ```rust
410    /// use perl_parser_core::Parser;
411    ///
412    /// let mut parser = Parser::new("my $x = ;");
413    /// let output = parser.parse_with_recovery();
414    /// assert!(!output.diagnostics.is_empty() || matches!(output.ast.kind, perl_parser_core::NodeKind::Program { .. }));
415    /// ```
416    pub fn parse_with_recovery(&mut self) -> ParseOutput {
417        let ast = match self.parse() {
418            Ok(node) => node,
419            Err(e) => {
420                // If parse() returned Err, it was a non-recoverable error (e.g. recursion limit)
421                // Ensure it's recorded if not already
422                if !self.errors.contains(&e) {
423                    self.errors.push(e.clone());
424                }
425
426                // Return a dummy Program node with the error
427                Node::new(
428                    NodeKind::Program { statements: vec![] },
429                    SourceLocation { start: 0, end: 0 },
430                )
431            }
432        };
433
434        ParseOutput::with_errors(ast, self.errors.clone())
435    }
436}
437
// The remainder of the parser lives in the files below. They are pulled in
// with `include!`, which splices each file's contents textually into this
// module rather than declaring separate submodules.
// NOTE(review): presumably this lets those files extend `Parser` with direct
// access to its private fields — confirm against the included files.
include!("helpers.rs");
include!("heredoc.rs");
include!("statements.rs");
include!("variables.rs");
include!("control_flow.rs");
include!("declarations.rs");
include!("expressions/mod.rs");
include!("expressions/precedence.rs");
include!("expressions/unary.rs");
include!("expressions/postfix.rs");
include!("expressions/primary.rs");
include!("expressions/calls.rs");
include!("expressions/hashes.rs");
include!("expressions/quotes.rs");
452
453#[cfg(test)]
454mod builtin_block_list_tests;
455#[cfg(test)]
456mod builtin_expansion_tests;
457#[cfg(test)]
458mod chained_deref_method_tests;
459#[cfg(test)]
460mod coderef_invocation_tests;
461#[cfg(test)]
462mod complex_args_tests;
463#[cfg(test)]
464mod control_flow_expr_tests;
465#[cfg(test)]
466mod declaration_in_args_tests;
467#[cfg(test)]
468mod error_recovery_tests;
469#[cfg(test)]
470mod eval_goto_tests;
471#[cfg(test)]
472mod for_builtin_block_tests;
473#[cfg(test)]
474mod format_comprehensive_tests;
475#[cfg(test)]
476mod format_tests;
477#[cfg(test)]
478mod forward_declaration_tests;
479#[cfg(test)]
480mod from_tokens_tests;
481#[cfg(test)]
482mod glob_assignment_tests;
483#[cfg(test)]
484mod glob_tests;
485#[cfg(test)]
486mod hash_vs_block_tests;
487#[cfg(test)]
488mod heredoc_security_tests;
489#[cfg(test)]
490mod indirect_call_tests;
491#[cfg(test)]
492mod indirect_object_tests;
493#[cfg(test)]
494mod loop_control_tests;
495#[cfg(test)]
496mod qualified_variable_subscript_tests;
497#[cfg(test)]
498mod regex_delimiter_tests;
499#[cfg(test)]
500mod slash_ambiguity_tests;
501#[cfg(test)]
502mod statement_modifier_tests;
503#[cfg(test)]
504mod tests;
505#[cfg(test)]
506mod tie_tests;
507#[cfg(test)]
508mod typed_variable_declaration_tests;
509#[cfg(test)]
510mod unclosed_block_recovery_tests;
511#[cfg(test)]
512mod use_overload_tests;
513#[cfg(test)]
514mod x_repetition_tests;
515
#[cfg(test)]
mod strip_qw_comments_unit_tests {
    use super::strip_qw_comments;

    /// Strip comments from `content` and split the remainder on whitespace,
    /// yielding the list elements a `qw()` body would contribute.
    fn words(content: &str) -> Vec<String> {
        strip_qw_comments(content).split_whitespace().map(str::to_owned).collect()
    }

    #[test]
    fn test_strip_basic() {
        assert_eq!(words("foo # comment\n bar"), ["foo", "bar"]);
    }

    #[test]
    fn test_strip_without_comments() {
        // Input that contains no `#` must come through untouched.
        assert_eq!(strip_qw_comments("foo bar\nbaz"), "foo bar\nbaz");
        assert_eq!(words("foo bar\nbaz"), ["foo", "bar", "baz"]);
    }

    #[test]
    fn test_strip_comment_only_and_empty_input() {
        assert!(words("# nothing but a comment").is_empty());
        assert!(words("").is_empty());
    }

    #[test]
    fn test_strip_hash_adjacent_to_word() {
        // The cut happens at the first `#`, even without surrounding spaces,
        // and only affects the line the `#` appears on.
        assert_eq!(words("foo#bar # again\nbaz"), ["foo", "baz"]);
    }
}
525}