// perl_incremental_parsing/incremental/incremental_checkpoint.rs
//! Incremental parser with lexer checkpointing
//!
//! This module provides a fully incremental parser that uses lexer checkpoints
//! to efficiently re-lex only the changed portions of the input.
//!
//! # Pipeline integration
//!
//! Token caching and `Parser::from_tokens` are now wired together:
//!
//! 1. `parse_with_checkpoints` collects **parser tokens** (trivia-filtered,
//!    kind-converted) and caches them alongside the lexer checkpoints.
//! 2. `reparse_from_checkpoint` assembles a mixed token list from cached tokens
//!    (before the edit) + freshly-lexed tokens (affected region) + cached or
//!    freshly-lexed tokens (after the edit), then calls `Parser::from_tokens`
//!    to skip re-lexing the unchanged portions.
17use crate::{ast::Node, edit::Edit as OriginalEdit, error::ParseResult, parser::Parser};
18use perl_lexer::{CheckpointCache, Checkpointable, LexerCheckpoint, PerlLexer};
19use perl_parser_core::token_stream::{Token, TokenStream};
20
/// Incremental parser with lexer checkpointing.
///
/// Owns the current source text and the latest parse tree, plus two caches
/// that make incremental reparses cheap: a [`CheckpointCache`] of saved lexer
/// states and a [`TokenCache`] of already-converted parser tokens.
pub struct CheckpointedIncrementalParser {
    /// Current source text (kept in sync with every applied edit).
    source: String,
    /// Most recent parse tree, if a parse has completed.
    tree: Option<Node>,
    /// Lexer checkpoint cache used to resume lexing near an edit.
    checkpoint_cache: CheckpointCache,
    /// Token cache for reuse — stores **parser** tokens (trivia-filtered, kind-converted).
    token_cache: TokenCache,
    /// Cumulative counters describing how much work each (re)parse performed.
    stats: IncrementalStats,
}
34
/// Cache for parser tokens to avoid re-lexing.
///
/// Stores [`Token`] values (from `perl-token`) rather than raw lexer tokens so
/// that the cached values can be fed directly to [`Parser::from_tokens`].
struct TokenCache {
    /// All cached parser tokens in source order.
    tokens: Vec<Token>,
    /// The byte range `[start, end)` that the cached tokens cover;
    /// `None` when the cache is empty or has been invalidated.
    valid_range: Option<(usize, usize)>,
}
45
46impl TokenCache {
47    fn new() -> Self {
48        TokenCache { tokens: Vec::new(), valid_range: None }
49    }
50
51    /// Return a sub-slice of cached tokens whose `start` is `>= position`.
52    ///
53    /// Because tokens are stored in source order we can binary-search for the
54    /// first token at or after `position`.
55    fn get_tokens_from(&self, position: usize) -> Option<&[Token]> {
56        let (valid_start, valid_end) = self.valid_range?;
57        if position < valid_start || position >= valid_end {
58            return None;
59        }
60        let idx = self.tokens.partition_point(|t| t.start < position);
61        Some(&self.tokens[idx..])
62    }
63
64    /// Return a sub-slice of cached tokens that end at or before `position`.
65    fn get_tokens_before(&self, position: usize) -> Option<&[Token]> {
66        let (valid_start, _valid_end) = self.valid_range?;
67        if self.tokens.is_empty() || valid_start >= position {
68            return None;
69        }
70        let idx = self.tokens.partition_point(|t| t.end <= position);
71        if idx == 0 { None } else { Some(&self.tokens[..idx]) }
72    }
73
74    /// Replace the entire cache with a new set of parser tokens.
75    fn cache_tokens(&mut self, start: usize, end: usize, tokens: Vec<Token>) {
76        self.tokens = tokens;
77        self.valid_range = Some((start, end));
78    }
79
80    /// Invalidate the cache if the given byte range overlaps with the cached range.
81    fn invalidate_range(&mut self, start: usize, end: usize) {
82        if let Some((valid_start, valid_end)) = self.valid_range {
83            if start <= valid_end && end >= valid_start {
84                self.valid_range = None;
85                self.tokens.clear();
86            }
87        }
88    }
89}
90
/// Statistics for incremental parsing.
///
/// All counters are cumulative over the lifetime of the parser.
#[derive(Debug, Default)]
pub struct IncrementalStats {
    /// Total number of parses (full + incremental).
    pub total_parses: usize,
    /// Number of parses triggered by `apply_edit`.
    pub incremental_parses: usize,
    /// Parser tokens taken from the token cache instead of the lexer.
    pub tokens_reused: usize,
    /// Tokens produced by re-running the lexer.
    pub tokens_relexed: usize,
    /// Number of reparses that resumed from a lexer checkpoint.
    pub checkpoints_used: usize,
    /// Token-cache lookups after an edit that found reusable tokens.
    pub cache_hits: usize,
    /// Token-cache lookups after an edit that found nothing reusable.
    pub cache_misses: usize,
}
102
/// Simple edit structure for demos.
///
/// Describes a byte-range replacement: the bytes in `[start, end)` of the
/// current source are replaced with `new_text`.
#[derive(Debug, Clone)]
pub struct SimpleEdit {
    /// Byte offset where the edit begins (inclusive).
    pub start: usize,
    /// Byte offset where the edit ends (exclusive).
    pub end: usize,
    /// Replacement text inserted at `start`.
    pub new_text: String,
}
110
111impl SimpleEdit {
112    /// Convert to original Edit format if needed
113    pub fn to_original_edit(&self) -> OriginalEdit {
114        // Simplified conversion - would need proper position tracking
115        OriginalEdit::new(
116            self.start,
117            self.end,
118            self.start + self.new_text.len(),
119            crate::position::Position::new(self.start, 0, 0),
120            crate::position::Position::new(self.end, 0, 0),
121            crate::position::Position::new(self.start + self.new_text.len(), 0, 0),
122        )
123    }
124}
125
impl Default for CheckpointedIncrementalParser {
    /// Equivalent to [`CheckpointedIncrementalParser::new`].
    fn default() -> Self {
        Self::new()
    }
}
131
132impl CheckpointedIncrementalParser {
133    /// Create a new incremental parser
134    pub fn new() -> Self {
135        CheckpointedIncrementalParser {
136            source: String::new(),
137            tree: None,
138            checkpoint_cache: CheckpointCache::new(50), // Keep 50 checkpoints for large files (#2080)
139            token_cache: TokenCache::new(),
140            stats: IncrementalStats::default(),
141        }
142    }
143
144    /// Parse the initial source
145    pub fn parse(&mut self, source: String) -> ParseResult<Node> {
146        self.source = source;
147        self.stats.total_parses += 1;
148
149        // Full parse with checkpoint collection
150        let tree = self.parse_with_checkpoints()?;
151        self.tree = Some(tree.clone());
152
153        Ok(tree)
154    }
155
156    /// Apply an edit and reparse incrementally
157    pub fn apply_edit(&mut self, edit: &SimpleEdit) -> ParseResult<Node> {
158        self.stats.total_parses += 1;
159        self.stats.incremental_parses += 1;
160
161        // Apply edit to source
162        let new_content = &edit.new_text;
163        self.source.replace_range(edit.start..edit.end, new_content);
164
165        // Invalidate token cache for edited range
166        self.token_cache.invalidate_range(edit.start, edit.end);
167
168        // Update checkpoint cache
169        let old_len = edit.end - edit.start;
170        let new_len = new_content.len();
171        self.checkpoint_cache.apply_edit(edit.start, old_len, new_len);
172
173        // Find nearest checkpoint before edit
174        let checkpoint = self.checkpoint_cache.find_before(edit.start);
175
176        if let Some(checkpoint) = checkpoint {
177            self.stats.checkpoints_used += 1;
178            self.reparse_from_checkpoint(checkpoint.clone(), edit)
179        } else {
180            // No checkpoint found, full reparse
181            self.parse_with_checkpoints()
182        }
183    }
184
185    /// Parse with checkpoint collection and parser-token caching.
186    ///
187    /// Collects lexer checkpoints at pre-defined positions and caches the full
188    /// set of **parser** tokens (trivia-filtered) so they can be reused during
189    /// subsequent incremental reparses.
190    fn parse_with_checkpoints(&mut self) -> ParseResult<Node> {
191        let mut lexer = PerlLexer::new(&self.source);
192        let mut raw_tokens = Vec::new();
193        let mut checkpoint_positions = vec![0, 100, 500, 1000, 5000];
194
195        // Collect raw lexer tokens and save checkpoints at specific positions
196        let mut position = 0;
197        while let Some(token) = lexer.next_token() {
198            // Save checkpoint at specific positions
199            if checkpoint_positions.first() == Some(&position) {
200                checkpoint_positions.remove(0);
201                let checkpoint = lexer.checkpoint();
202                self.checkpoint_cache.add(checkpoint);
203            }
204
205            position = token.end;
206
207            // Stop at EOF
208            if matches!(token.token_type, perl_lexer::TokenType::EOF) {
209                break;
210            }
211
212            raw_tokens.push(token);
213        }
214
215        // Convert raw lexer tokens to parser tokens (trivia-filtered + kind-mapped)
216        // and cache them for reuse in incremental reparses.
217        let parser_tokens = TokenStream::lexer_tokens_to_parser_tokens(raw_tokens);
218
219        if let (Some(first), Some(last)) = (parser_tokens.first(), parser_tokens.last()) {
220            let start = first.start;
221            let end = last.end;
222            self.token_cache.cache_tokens(start, end, parser_tokens);
223        }
224
225        // Full parse from source — this initial parse still uses the lexer
226        // directly so that context-sensitive constructs (e.g. regex vs division)
227        // are correctly disambiguated.
228        let mut parser = Parser::new(&self.source);
229        parser.parse()
230    }
231
232    /// Reparse from a lexer checkpoint using cached tokens where possible.
233    ///
234    /// Assembles a parser-token stream that reuses cached tokens for the
235    /// unchanged regions and re-lexes only the portion affected by the edit,
236    /// then calls [`Parser::from_tokens`] to drive the parse without invoking
237    /// the lexer again.
238    fn reparse_from_checkpoint(
239        &mut self,
240        checkpoint: LexerCheckpoint,
241        edit: &SimpleEdit,
242    ) -> ParseResult<Node> {
243        // Restore the lexer at the checkpoint position so we can re-lex the
244        // affected region.
245        let mut lexer = PerlLexer::new(&self.source);
246        lexer.restore(&checkpoint);
247
248        let relex_start = checkpoint.position;
249        let mut parser_tokens: Vec<Token> = Vec::new();
250
251        // --- Phase 1: reuse cached tokens before the checkpoint ---
252        if let Some(cached) = self.token_cache.get_tokens_before(relex_start) {
253            parser_tokens.extend_from_slice(cached);
254            self.stats.tokens_reused += cached.len();
255        }
256
257        // --- Phase 2: re-lex the region from the checkpoint through the edit ---
258        let relex_end = edit.start + edit.new_text.len() + 100; // small lookahead
259        let mut raw_relexed: Vec<perl_lexer::Token> = Vec::new();
260        loop {
261            match lexer.next_token() {
262                Some(token) if matches!(token.token_type, perl_lexer::TokenType::EOF) => break,
263                Some(token) => {
264                    let token_end = token.end;
265                    raw_relexed.push(token);
266                    self.stats.tokens_relexed += 1;
267                    if token_end >= relex_end {
268                        break;
269                    }
270                }
271                None => break,
272            }
273        }
274        let converted = TokenStream::lexer_tokens_to_parser_tokens(raw_relexed);
275        parser_tokens.extend(converted);
276
277        // --- Phase 3: reuse cached tokens after the affected region ---
278        let after_edit_pos = edit.start + edit.new_text.len();
279        let byte_shift: isize = edit.new_text.len() as isize - (edit.end - edit.start) as isize;
280
281        if let Some(cached) = self.token_cache.get_tokens_from(after_edit_pos) {
282            self.stats.cache_hits += 1;
283            for token in cached {
284                // Adjust byte positions to account for the inserted/removed bytes.
285                let adjusted = Token {
286                    kind: token.kind,
287                    text: token.text.clone(),
288                    start: (token.start as isize + byte_shift) as usize,
289                    end: (token.end as isize + byte_shift) as usize,
290                };
291                parser_tokens.push(adjusted);
292                self.stats.tokens_reused += 1;
293            }
294        } else {
295            self.stats.cache_misses += 1;
296            // No cache hit — lex the remainder of the source.
297            let mut raw_tail: Vec<perl_lexer::Token> = Vec::new();
298            while let Some(token) = lexer.next_token() {
299                if matches!(token.token_type, perl_lexer::TokenType::EOF) {
300                    break;
301                }
302                raw_tail.push(token);
303                self.stats.tokens_relexed += 1;
304            }
305            parser_tokens.extend(TokenStream::lexer_tokens_to_parser_tokens(raw_tail));
306        }
307
308        // Update token cache with the final merged token list.
309        if let (Some(first), Some(last)) = (parser_tokens.first(), parser_tokens.last()) {
310            let start = first.start;
311            let end = last.end;
312            self.token_cache.cache_tokens(start, end, parser_tokens.clone());
313        }
314
315        // Drive the parse from the pre-assembled token stream — no re-lexing.
316        let mut parser = Parser::from_tokens(parser_tokens, &self.source);
317        let tree = parser.parse()?;
318        self.tree = Some(tree.clone());
319
320        Ok(tree)
321    }
322
323    /// Get parsing statistics
324    pub fn stats(&self) -> &IncrementalStats {
325        &self.stats
326    }
327
328    /// Clear all caches
329    pub fn clear_caches(&mut self) {
330        self.checkpoint_cache.clear();
331        self.token_cache = TokenCache::new();
332    }
333}
334
#[cfg(test)]
mod tests {
    use super::*;
    use crate::NodeKind;
    use perl_tdd_support::must;

    #[test]
    fn test_checkpoint_incremental_parsing() {
        let mut parser = CheckpointedIncrementalParser::new();

        // Full parse of a two-statement program.
        let initial = must(parser.parse("my $x = 42;\nmy $y = 99;\n".to_string()));

        // Replace `42` with `4242` and reparse incrementally.
        let edit = SimpleEdit { start: 8, end: 10, new_text: "4242".to_string() };
        let edited = must(parser.apply_edit(&edit));

        // One full parse plus one incremental parse should be recorded.
        let stats = parser.stats();
        assert_eq!(stats.total_parses, 2);
        assert_eq!(stats.incremental_parses, 1);
        assert!(stats.checkpoints_used > 0 || stats.tokens_relexed > 0);

        // Both trees must be programs with the same statement count.
        match (&initial.kind, &edited.kind) {
            (NodeKind::Program { statements: s1 }, NodeKind::Program { statements: s2 }) => {
                assert_eq!(s1.len(), s2.len());
            }
            _ => unreachable!("Expected program nodes"),
        }
    }

    #[test]
    fn test_checkpoint_cache_update() {
        let mut parser = CheckpointedIncrementalParser::new();

        // Parse a file made of 20 identical statements.
        must(parser.parse("my $x = 1;\n".repeat(20)));

        // Apply two edits at different offsets.
        for (start, end, text) in [(8usize, 9usize, "42"), (20, 21, "99")] {
            let edit = SimpleEdit { start, end, new_text: text.to_string() };
            must(parser.apply_edit(&edit));
        }

        let stats = parser.stats();
        assert_eq!(stats.incremental_parses, 2);
        assert!(stats.tokens_relexed > 0);
    }

    #[test]
    fn test_from_tokens_used_in_reparse() {
        // `reparse_from_checkpoint` should drive `Parser::from_tokens`; the
        // stats show that the reparse either reused cached tokens or relexed
        // the affected region.
        let mut parser = CheckpointedIncrementalParser::new();

        // `my $preamble = 11111;\n` — large enough to have a checkpoint at
        // position 0 and cached tokens the reparse can draw on.
        let source = format!("my $preamble = {};\n", "1".repeat(5));
        must(parser.parse(source.clone()));

        // Overwrite the `11111` literal (just past `= `) with `99999`.
        let start = source.find('=').unwrap_or(13) + 2;
        let edit = SimpleEdit { start, end: start + 5, new_text: "99999".to_string() };
        must(parser.apply_edit(&edit));

        let stats = parser.stats();
        assert_eq!(stats.incremental_parses, 1);
        assert!(
            stats.tokens_relexed > 0 || stats.tokens_reused > 0,
            "expected either reused or relexed tokens, got {:?}",
            stats
        );
    }
}