Skip to main content

perl_parser_core/engine/
parser_context.rs

1//! Parser context with error recovery support
2//!
3//! This module provides a parsing context that tracks errors, positions,
4//! and supports error recovery for IDE scenarios.
5
6use crate::{
7    error::{BudgetTracker, ParseBudget},
8    error_recovery::ParseError,
9    position::{Position, Range},
10    token_wrapper::TokenWithPosition,
11};
12use perl_ast_v2::NodeIdGenerator;
13use perl_lexer::TokenType;
14use perl_position_tracking::LineStartsCache;
15use std::collections::VecDeque;
16
17/// Parser context with error tracking and recovery
/// Parser context with error tracking and recovery
///
/// Owns the fully-materialized token stream (produced eagerly at
/// construction), the accumulated parse errors, and the budget state that
/// bounds error count and nesting depth during recovery-oriented parsing.
pub struct ParserContext {
    /// Token stream with positions (tokenized up front in `new`)
    tokens: VecDeque<TokenWithPosition>,
    /// Index of the current token within `tokens`
    current: usize,
    /// Node ID generator for AST construction
    pub id_generator: NodeIdGenerator,
    /// Accumulated parse errors (drained via `take_errors`)
    errors: Vec<ParseError>,
    /// Source text the tokens were produced from
    source: String,
    /// Position tracker used during construction; underscore-prefixed
    /// because it is not consulted again after tokenization
    _position_tracker: PositionTracker,
    /// Budget limits for this parse
    budget: ParseBudget,
    /// Budget consumption tracker
    budget_tracker: BudgetTracker,
}
36
/// Efficient position tracking using a line-starts cache
///
/// Leverages the existing `LineStartsCache` for O(log n) byte-offset →
/// (line, column) lookups instead of O(n) character-by-character
/// advancement. Intended to provide UTF-16 aware position mapping for LSP
/// compatibility — NOTE(review): the UTF-16 semantics live inside
/// `LineStartsCache::offset_to_position`; confirm against that crate.
struct PositionTracker {
    /// Cache for O(log n) position lookups
    line_cache: LineStartsCache,
    /// Owned copy of the source text the cache was built from
    source: String,
}
48
49impl PositionTracker {
50    fn new(source: String) -> Self {
51        let line_cache = LineStartsCache::new(&source);
52        PositionTracker { line_cache, source }
53    }
54
55    /// Convert byte offset to position with UTF-16 support
56    fn byte_to_position(&self, byte_offset: usize) -> Position {
57        let (line, character) = self.line_cache.offset_to_position(&self.source, byte_offset);
58        // LineStartsCache returns 0-based line numbers, but Position expects 1-based
59        Position::new(byte_offset, line + 1, character + 1)
60    }
61}
62
63impl ParserContext {
64    /// Create a new parser context
65    pub fn new(source: String) -> Self {
66        let mut tokens = VecDeque::new();
67        let position_tracker = PositionTracker::new(source.clone());
68
69        // Tokenize the source using mode-aware lexer
70        let mut lexer = perl_lexer::PerlLexer::new(&source);
71        loop {
72            match lexer.next_token() {
73                Some(token) => {
74                    // Skip EOF tokens to avoid infinite loop
75                    if matches!(token.token_type, TokenType::EOF) {
76                        break;
77                    }
78
79                    let start = token.start;
80                    let end = token.end;
81
82                    // Use efficient position mapping with UTF-16 support
83                    let start_pos = position_tracker.byte_to_position(start);
84                    let end_pos = position_tracker.byte_to_position(end);
85
86                    tokens.push_back(TokenWithPosition::new(token, start_pos, end_pos));
87                }
88                None => break,
89            }
90        }
91
92        ParserContext {
93            tokens,
94            current: 0,
95            id_generator: NodeIdGenerator::new(),
96            errors: Vec::new(),
97            source,
98            _position_tracker: position_tracker,
99            budget: ParseBudget::default(),
100            budget_tracker: BudgetTracker::new(),
101        }
102    }
103
104    /// Create a new parser context with a custom budget.
105    pub fn with_budget(source: String, budget: ParseBudget) -> Self {
106        let mut ctx = Self::new(source);
107        ctx.budget = budget;
108        ctx
109    }
110
111    /// Get the current budget.
112    pub fn budget(&self) -> &ParseBudget {
113        &self.budget
114    }
115
116    /// Get the budget tracker.
117    pub fn budget_tracker(&self) -> &BudgetTracker {
118        &self.budget_tracker
119    }
120
121    /// Get mutable access to the budget tracker.
122    pub fn budget_tracker_mut(&mut self) -> &mut BudgetTracker {
123        &mut self.budget_tracker
124    }
125
126    /// Check if error budget is exhausted.
127    pub fn errors_exhausted(&self) -> bool {
128        self.budget_tracker.errors_exhausted(&self.budget)
129    }
130
131    /// Check if depth budget would be exceeded.
132    pub fn depth_would_exceed(&self) -> bool {
133        self.budget_tracker.depth_would_exceed(&self.budget)
134    }
135
136    /// Enter a nesting level, tracking depth.
137    pub fn enter_depth(&mut self) -> bool {
138        if self.depth_would_exceed() {
139            return false;
140        }
141        self.budget_tracker.enter_depth();
142        true
143    }
144
145    /// Exit a nesting level.
146    pub fn exit_depth(&mut self) {
147        self.budget_tracker.exit_depth();
148    }
149
150    /// Get current token
151    pub fn current_token(&self) -> Option<&TokenWithPosition> {
152        self.tokens.get(self.current)
153    }
154
155    /// Peek at next token
156    pub fn peek_token(&self, offset: usize) -> Option<&TokenWithPosition> {
157        self.tokens.get(self.current + offset)
158    }
159
160    /// Advance to next token
161    pub fn advance(&mut self) -> Option<&TokenWithPosition> {
162        if self.current < self.tokens.len() {
163            self.current += 1;
164        }
165        self.current_token()
166    }
167
168    /// Check if at end of tokens
169    pub fn is_eof(&self) -> bool {
170        self.current >= self.tokens.len()
171    }
172
173    /// Get current position
174    pub fn current_position(&self) -> Position {
175        if let Some(token) = self.current_token() {
176            token.range().start
177        } else if let Some(last_token) = self.tokens.back() {
178            // At EOF, use end of last token
179            last_token.range().end
180        } else {
181            Position::new(0, 1, 1)
182        }
183    }
184
185    /// Get current position range
186    pub fn current_position_range(&self) -> Range {
187        if let Some(token) = self.current_token() {
188            token.range()
189        } else {
190            let pos = self.current_position();
191            Range::new(pos, pos)
192        }
193    }
194
195    /// Add a parse error, tracking budget consumption.
196    ///
197    /// Returns `true` if the error was added, `false` if error budget exhausted.
198    pub fn add_error(&mut self, error: ParseError) -> bool {
199        if self.errors_exhausted() {
200            return false;
201        }
202        self.errors.push(error);
203        self.budget_tracker.record_error();
204        true
205    }
206
207    /// Add a parse error without checking budget (for critical errors).
208    pub fn add_error_unchecked(&mut self, error: ParseError) {
209        self.errors.push(error);
210        self.budget_tracker.record_error();
211    }
212
213    /// Get all accumulated errors
214    pub fn take_errors(&mut self) -> Vec<ParseError> {
215        std::mem::take(&mut self.errors)
216    }
217
218    /// Get current token index (for saving/restoring)
219    pub fn current_index(&self) -> usize {
220        self.current
221    }
222
223    /// Set current token index (for restoring)
224    pub fn set_index(&mut self, index: usize) {
225        self.current = index.min(self.tokens.len());
226    }
227
228    /// Match and consume a specific token type
229    pub fn expect(&mut self, expected: TokenType) -> Result<&TokenWithPosition, ParseError> {
230        match self.current_token() {
231            Some(token) if token.token.token_type == expected => {
232                self.advance();
233                Ok(&self.tokens[self.current - 1])
234            }
235            Some(token) => Err(ParseError::new(
236                format!("Expected {:?}, found {:?}", expected, token.token.token_type),
237                token.range(),
238            )
239            .with_expected(vec![format!("{:?}", expected)])
240            .with_found(format!("{:?}", token.token.token_type))),
241            None => Err(ParseError::new(
242                format!("Expected {:?}, found end of file", expected),
243                self.current_position_range(),
244            )
245            .with_expected(vec![format!("{:?}", expected)])
246            .with_found("EOF".to_string())),
247        }
248    }
249
250    /// Check if current token matches
251    pub fn check(&self, token_type: &TokenType) -> bool {
252        self.current_token().map(|t| &t.token.token_type == token_type).unwrap_or(false)
253    }
254
255    /// Consume token if it matches
256    pub fn consume(&mut self, token_type: &TokenType) -> bool {
257        if self.check(token_type) {
258            self.advance();
259            true
260        } else {
261            false
262        }
263    }
264
265    /// Get source slice for a range
266    pub fn source_slice(&self, range: &Range) -> &str {
267        &self.source[range.start.byte..range.end.byte]
268    }
269}
270
#[cfg(test)]
mod tests {
    use super::*;
    use perl_tdd_support::must_some;

    #[test]
    fn test_parser_context_creation() {
        let source = "my $x = 42;".to_string();
        let ctx = ParserContext::new(source);

        assert!(!ctx.is_eof());
        assert!(!ctx.tokens.is_empty());
    }

    #[test]
    fn test_token_advancement() {
        let source = "my $x".to_string();
        let mut ctx = ParserContext::new(source);

        // First token should be 'my'
        assert!(matches!(
            ctx.current_token().map(|t| &t.token.token_type),
            Some(TokenType::Keyword(k)) if k.as_ref() == "my"
        ));

        // Advance to next token
        ctx.advance();
        assert!(ctx.current_token().is_some());
    }

    #[test]
    fn test_error_accumulation() {
        let mut ctx = ParserContext::new("test".to_string());

        let error1 = ParseError::new("Error 1".to_string(), ctx.current_position_range());
        let error2 = ParseError::new("Error 2".to_string(), ctx.current_position_range());

        // `add_error` reports whether the budget admitted the error —
        // assert the returns so a silently exhausted budget fails loudly.
        assert!(ctx.add_error(error1));
        assert!(ctx.add_error(error2));

        let errors = ctx.take_errors();
        assert_eq!(errors.len(), 2);
        assert_eq!(errors[0].message, "Error 1");
        assert_eq!(errors[1].message, "Error 2");

        // `take_errors` must drain the accumulator.
        assert!(ctx.take_errors().is_empty());
    }

    #[test]
    fn test_multiline_positions() {
        let source = "my $x = 42;\nmy $y = 43;".to_string();
        let ctx = ParserContext::new(source.clone());

        let first_offset = must_some(source.find("my"));
        let second_offset = must_some(source.rfind("my"));

        let first = must_some(ctx.tokens.iter().find(|t| t.range().start.byte == first_offset));
        assert_eq!(first.range().start.line, 1);
        assert_eq!(first.range().start.column, 1);
        assert_eq!(first.range().end.line, 1);
        assert_eq!(first.range().end.column, 3);

        let second = must_some(ctx.tokens.iter().find(|t| t.range().start.byte == second_offset));
        assert_eq!(second.range().start.line, 2);
        assert_eq!(second.range().start.column, 1);
        assert_eq!(second.range().end.line, 2);
        assert_eq!(second.range().end.column, 3);
    }

    #[test]
    fn test_multiline_string_token_positions() {
        let source = "my $s = \"a\nb\";".to_string();
        let ctx = ParserContext::new(source.clone());

        let string_offset = must_some(source.find('"'));
        let token = must_some(ctx.tokens.iter().find(|t| t.range().start.byte == string_offset));

        assert_eq!(token.range().start.line, 1);
        assert_eq!(token.range().start.column, 9);
        assert_eq!(token.range().end.line, 2);
        assert_eq!(token.range().end.column, 3);
    }

    #[test]
    fn test_utf16_position_mapping() {
        // Test with emoji which takes 2 UTF-16 code units
        let source = "my $emoji = 😀;".to_string();
        let ctx = ParserContext::new(source.clone());

        // Find the emoji token (if lexer produces it as separate token)
        // For now, test that positions are computed correctly for the = token
        let equals_offset = must_some(source.find('='));
        let equals_token =
            must_some(ctx.tokens.iter().find(|t| t.range().start.byte == equals_offset));

        // Before emoji: "my $emoji "  = 10 characters but the emoji counts as 2 UTF-16 units
        // So column should account for UTF-16 encoding
        assert_eq!(equals_token.range().start.line, 1);
        // The exact column depends on how the lexer tokenizes, but should be UTF-16 aware
        assert!(equals_token.range().start.column > 0);
    }

    #[test]
    fn test_crlf_line_endings() {
        let source = "my $x = 42;\r\nmy $y = 43;".to_string();
        let ctx = ParserContext::new(source.clone());

        let first_offset = must_some(source.find("my"));
        let second_offset = must_some(source.rfind("my"));

        let first = must_some(ctx.tokens.iter().find(|t| t.range().start.byte == first_offset));
        assert_eq!(first.range().start.line, 1);
        assert_eq!(first.range().start.column, 1);

        let second = must_some(ctx.tokens.iter().find(|t| t.range().start.byte == second_offset));
        assert_eq!(second.range().start.line, 2);
        assert_eq!(second.range().start.column, 1);
    }

    #[test]
    fn test_empty_source() {
        let source = "".to_string();
        let ctx = ParserContext::new(source);

        assert!(ctx.tokens.is_empty());
        assert!(ctx.is_eof());
    }

    #[test]
    fn test_single_token() {
        let source = "42".to_string();
        let ctx = ParserContext::new(source);

        assert_eq!(ctx.tokens.len(), 1);

        let token = &ctx.tokens[0];
        assert_eq!(token.range().start.byte, 0);
        assert_eq!(token.range().start.line, 1);
        assert_eq!(token.range().start.column, 1);
        assert_eq!(token.range().end.byte, 2);
        assert_eq!(token.range().end.line, 1);
        assert_eq!(token.range().end.column, 3);
    }
}