// perl_incremental_parsing/incremental/mod.rs

1#![allow(missing_docs)]
2
3use anyhow::Result;
4use lsp_types::Diagnostic;
5use lsp_types::TextDocumentContentChangeEvent;
6pub use perl_line_index::LineIndex;
7use ropey::Rope;
8use std::ops::Range;
9
10use crate::ast::{Node, NodeKind, SourceLocation};
11use crate::parser::Parser;
12use perl_lexer::{LexerMode, PerlLexer, Token, TokenType};
13
14pub mod incremental_advanced_reuse;
15pub mod incremental_checkpoint;
16pub mod incremental_document;
17pub mod incremental_edit;
18pub mod incremental_handler_v2;
19pub mod incremental_integration;
20pub mod incremental_simple;
21pub mod incremental_v2;
22
/// Stable restart points to avoid re-lexing the whole world
#[derive(Clone, Copy, Debug)]
pub struct LexCheckpoint {
    /// Byte offset in the source at which lexing can safely resume.
    pub byte: usize,
    /// Lexer mode to restore when resuming from this offset.
    pub mode: LexerMode,
    /// Zero-based line of `byte` (as produced by `LineIndex`).
    pub line: usize,
    /// Zero-based column of `byte` (as produced by `LineIndex`).
    pub column: usize,
}
31
/// Scope information at a parse checkpoint
#[derive(Clone, Debug, Default)]
pub struct ScopeSnapshot {
    /// Name of the enclosing `package` ("" until one is seen).
    pub package_name: String,
    /// Sigil-prefixed lexical variables (e.g. "$x", "@list") declared so far.
    pub locals: Vec<String>,
    /// `our`-declared package variables. NOTE(review): never populated by the
    /// walker in this module — presumably filled elsewhere; verify.
    pub our_vars: Vec<String>,
    /// Parent classes from `@ISA`. NOTE(review): also not populated here.
    pub parent_isa: Vec<String>,
}
40
/// Parse checkpoint with scope context
#[derive(Clone, Debug)]
pub struct ParseCheckpoint {
    /// Byte offset of the AST node this checkpoint anchors to.
    pub byte: usize,
    /// Scope context captured at this point (cloned, not shared).
    pub scope_snapshot: ScopeSnapshot,
    pub node_id: usize, // ID of AST node at this point
}
48
/// Incremental parsing state
///
/// Owns every artifact derived from the current document text: the rope and
/// line index for position mapping, the token stream, the AST, and the
/// checkpoints used to resume lexing/parsing near an edit site.
#[derive(Clone)]
pub struct IncrementalState {
    /// Rope view of the source, kept in sync with `source`.
    pub rope: Rope,
    /// Line/column <-> byte-offset mapping for `source`.
    pub line_index: LineIndex,
    /// Safe lexer restart points, ascending by byte offset.
    pub lex_checkpoints: Vec<LexCheckpoint>,
    /// Parser restart points with captured scope context.
    pub parse_checkpoints: Vec<ParseCheckpoint>,
    /// Most recent AST (an `Error` node when parsing failed).
    pub ast: Node,
    /// Token stream for `source`, excluding the trailing EOF token.
    pub tokens: Vec<Token>,
    /// The authoritative document text.
    pub source: String,
}
60
61impl IncrementalState {
62    pub fn new(source: String) -> Self {
63        let rope = Rope::from_str(&source);
64        let line_index = LineIndex::new(&source);
65
66        // Parse the initial document
67        let mut parser = Parser::new(&source);
68        let ast = match parser.parse() {
69            Ok(ast) => ast,
70            Err(e) => Node::new(
71                NodeKind::Error {
72                    message: e.to_string(),
73                    expected: vec![],
74                    found: None,
75                    partial: None,
76                },
77                SourceLocation { start: 0, end: source.len() },
78            ),
79        };
80
81        // Get tokens from lexer
82        let mut lexer = PerlLexer::new(&source);
83        let mut tokens = Vec::new();
84        loop {
85            match lexer.next_token() {
86                Some(token) => {
87                    if token.token_type == TokenType::EOF {
88                        break;
89                    }
90                    tokens.push(token);
91                }
92                None => break,
93            }
94        }
95
96        // Create initial checkpoints
97        let lex_checkpoints = Self::create_lex_checkpoints(&tokens, &line_index);
98        let parse_checkpoints = Self::create_parse_checkpoints(&ast);
99
100        Self { rope, line_index, lex_checkpoints, parse_checkpoints, ast, tokens, source }
101    }
102
103    /// Create lexer checkpoints at safe boundaries
104    fn create_lex_checkpoints(tokens: &[Token], line_index: &LineIndex) -> Vec<LexCheckpoint> {
105        let mut checkpoints =
106            vec![LexCheckpoint { byte: 0, mode: LexerMode::ExpectTerm, line: 0, column: 0 }];
107
108        let mut mode = LexerMode::ExpectTerm;
109
110        for token in tokens {
111            // Update mode based on token
112            mode = match token.token_type {
113                TokenType::Semicolon | TokenType::LeftBrace | TokenType::RightBrace => {
114                    // Safe boundary - reset to ExpectTerm
115                    let (line, column) = line_index.byte_to_position(token.end);
116                    checkpoints.push(LexCheckpoint {
117                        byte: token.end,
118                        mode: LexerMode::ExpectTerm,
119                        line,
120                        column,
121                    });
122                    LexerMode::ExpectTerm
123                }
124                TokenType::Keyword(ref kw) if kw.as_ref() == "sub" || kw.as_ref() == "package" => {
125                    let (line, column) = line_index.byte_to_position(token.start);
126                    checkpoints.push(LexCheckpoint {
127                        byte: token.start,
128                        mode: LexerMode::ExpectTerm, // ExpectIdentifier not available
129                        line,
130                        column,
131                    });
132                    LexerMode::ExpectTerm // ExpectIdentifier not available
133                }
134                TokenType::Identifier(_) | TokenType::Number(_) | TokenType::StringLiteral => {
135                    LexerMode::ExpectOperator
136                }
137                TokenType::Operator(_) => LexerMode::ExpectTerm,
138                _ => mode,
139            };
140        }
141
142        checkpoints
143    }
144
145    /// Create parse checkpoints at statement boundaries
146    fn create_parse_checkpoints(ast: &Node) -> Vec<ParseCheckpoint> {
147        let mut checkpoints = vec![];
148        let mut scope = ScopeSnapshot::default();
149
150        Self::walk_ast_for_checkpoints(ast, &mut checkpoints, &mut scope, 0);
151        checkpoints
152    }
153
    /// Recursive pre-order walk that records a checkpoint at every package,
    /// subroutine and block boundary, threading a `ScopeSnapshot` along so
    /// each checkpoint carries the lexical context visible at that point.
    ///
    /// `node_id` is a synthetic id derived multiplicatively from the parent's
    /// id (parent * 101 + child index, wrapping). NOTE(review): ids are not
    /// guaranteed unique — e.g. any chain of first children of the root keeps
    /// id 0 — confirm whether collisions matter to consumers of `node_id`.
    fn walk_ast_for_checkpoints(
        node: &Node,
        checkpoints: &mut Vec<ParseCheckpoint>,
        scope: &mut ScopeSnapshot,
        node_id: usize,
    ) {
        // Process current node
        match &node.kind {
            NodeKind::Package { name, .. } => {
                // Package changes the ambient namespace for everything after it.
                scope.package_name = name.clone();
                checkpoints.push(ParseCheckpoint {
                    byte: node.location.start,
                    scope_snapshot: scope.clone(),
                    node_id,
                });
            }
            NodeKind::Subroutine { .. } | NodeKind::Block { .. } => {
                // Structural boundaries make good parser restart points.
                checkpoints.push(ParseCheckpoint {
                    byte: node.location.start,
                    scope_snapshot: scope.clone(),
                    node_id,
                });
            }
            NodeKind::VariableDeclaration { variable, .. } => {
                // Extract variable name from the variable node
                if let NodeKind::Variable { name, sigil, .. } = &variable.kind {
                    // Include sigil for proper variable tracking
                    scope.locals.push(format!("{}{}", sigil, name));
                }
            }
            NodeKind::VariableListDeclaration { variables, .. } => {
                // Handle list declarations like my ($x, $y, @z)
                for var in variables {
                    if let NodeKind::Variable { name, sigil, .. } = &var.kind {
                        scope.locals.push(format!("{}{}", sigil, name));
                    }
                }
            }
            _ => {}
        }

        // Recurse into children based on node kind
        match &node.kind {
            NodeKind::Program { statements } => {
                // Top-level statements share the root scope (no clone): their
                // declarations remain visible to later siblings.
                for (i, stmt) in statements.iter().enumerate() {
                    let child_id = node_id.wrapping_mul(101).wrapping_add(i);
                    Self::walk_ast_for_checkpoints(stmt, checkpoints, scope, child_id);
                }
            }
            NodeKind::Block { statements } => {
                // Enter new scope for blocks
                let mut local_scope = scope.clone();
                for (i, stmt) in statements.iter().enumerate() {
                    let child_id = node_id.wrapping_mul(101).wrapping_add(i);
                    Self::walk_ast_for_checkpoints(stmt, checkpoints, &mut local_scope, child_id);
                }
            }
            NodeKind::Subroutine { body, .. } => {
                // Subroutine body is a single node (Block), not Vec<Node>
                let mut local_scope = scope.clone();
                let child_id = node_id.wrapping_mul(101);
                Self::walk_ast_for_checkpoints(body, checkpoints, &mut local_scope, child_id);
            }
            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
                let base_id = node_id.wrapping_mul(101);
                Self::walk_ast_for_checkpoints(condition, checkpoints, scope, base_id);

                // then_branch is Box<Node>, not Option<Box<Node>>
                Self::walk_ast_for_checkpoints(
                    then_branch,
                    checkpoints,
                    scope,
                    base_id.wrapping_add(1),
                );

                // elsif_branches is Vec<(Box<Node>, Box<Node>)>
                // Each elsif pair gets two consecutive ids starting at (i+2)*2.
                for (i, (elsif_cond, elsif_block)) in elsif_branches.iter().enumerate() {
                    let elsif_base = base_id.wrapping_add((i + 2) * 2);
                    Self::walk_ast_for_checkpoints(elsif_cond, checkpoints, scope, elsif_base);
                    Self::walk_ast_for_checkpoints(
                        elsif_block,
                        checkpoints,
                        scope,
                        elsif_base.wrapping_add(1),
                    );
                }
                if let Some(else_br) = else_branch {
                    // Placed after all elsif id pairs.
                    let else_id = base_id.wrapping_add((elsif_branches.len() + 2) * 2);
                    Self::walk_ast_for_checkpoints(else_br, checkpoints, scope, else_id);
                }
            }
            NodeKind::While { condition, body, .. } => {
                let base_id = node_id.wrapping_mul(101);
                Self::walk_ast_for_checkpoints(condition, checkpoints, scope, base_id);
                // body is Box<Node>, not Option<Box<Node>>
                Self::walk_ast_for_checkpoints(body, checkpoints, scope, base_id.wrapping_add(1));
            }
            NodeKind::For { init, condition, update, body, .. } => {
                // C-style for: init/condition/update are optional, so ids are
                // assigned sequentially to whichever parts are present.
                let base_id = node_id.wrapping_mul(101);
                let mut offset = 0;
                if let Some(init) = init {
                    Self::walk_ast_for_checkpoints(
                        init,
                        checkpoints,
                        scope,
                        base_id.wrapping_add(offset),
                    );
                    offset += 1;
                }
                if let Some(cond) = condition {
                    Self::walk_ast_for_checkpoints(
                        cond,
                        checkpoints,
                        scope,
                        base_id.wrapping_add(offset),
                    );
                    offset += 1;
                }
                if let Some(upd) = update {
                    Self::walk_ast_for_checkpoints(
                        upd,
                        checkpoints,
                        scope,
                        base_id.wrapping_add(offset),
                    );
                    offset += 1;
                }
                // body is Box<Node>, not Option<Box<Node>>
                Self::walk_ast_for_checkpoints(
                    body,
                    checkpoints,
                    scope,
                    base_id.wrapping_add(offset),
                );
            }
            NodeKind::Binary { left, right, .. } => {
                let base_id = node_id.wrapping_mul(101);
                Self::walk_ast_for_checkpoints(left, checkpoints, scope, base_id);
                Self::walk_ast_for_checkpoints(right, checkpoints, scope, base_id.wrapping_add(1));
            }
            NodeKind::Assignment { lhs, rhs, .. } => {
                let base_id = node_id.wrapping_mul(101);
                Self::walk_ast_for_checkpoints(lhs, checkpoints, scope, base_id);
                Self::walk_ast_for_checkpoints(rhs, checkpoints, scope, base_id.wrapping_add(1));
            }
            NodeKind::VariableDeclaration { initializer, .. } => {
                if let Some(init) = initializer {
                    let child_id = node_id.wrapping_mul(101);
                    Self::walk_ast_for_checkpoints(init, checkpoints, scope, child_id);
                }
            }
            // Add more cases as needed
            _ => {}
        }
    }
309
310    /// Find the best checkpoint before a given byte offset
311    pub fn find_lex_checkpoint(&self, byte: usize) -> Option<&LexCheckpoint> {
312        self.lex_checkpoints.iter().rev().find(|cp| cp.byte <= byte)
313    }
314
315    /// Find the best parse checkpoint before a given byte offset
316    pub fn find_parse_checkpoint(&self, byte: usize) -> Option<&ParseCheckpoint> {
317        self.parse_checkpoints.iter().rev().find(|cp| cp.byte <= byte)
318    }
319}
320
/// Edit description
///
/// Byte-offset form of a text replacement: the bytes in
/// `start_byte..old_end_byte` of the old text are replaced by `new_text`,
/// which occupies `start_byte..new_end_byte` in the new text.
#[derive(Clone, Debug)]
pub struct Edit {
    /// Start of the replaced span (byte offset in the old text).
    pub start_byte: usize,
    /// Exclusive end of the replaced span in the old text.
    pub old_end_byte: usize,
    /// Exclusive end of the inserted text in the new text.
    pub new_end_byte: usize,
    /// Replacement text.
    pub new_text: String,
}
329
330impl Edit {
331    /// Convert LSP change to Edit
332    pub fn from_lsp_change(
333        change: &TextDocumentContentChangeEvent,
334        line_index: &LineIndex,
335        old_text: &str,
336    ) -> Option<Self> {
337        if let Some(range) = change.range {
338            let start_byte = line_index
339                .position_to_byte(range.start.line as usize, range.start.character as usize)?;
340            let old_end_byte = line_index
341                .position_to_byte(range.end.line as usize, range.end.character as usize)?;
342            let new_end_byte = start_byte + change.text.len();
343
344            Some(Edit { start_byte, old_end_byte, new_end_byte, new_text: change.text.clone() })
345        } else {
346            // Full document change
347            Some(Edit {
348                start_byte: 0,
349                old_end_byte: old_text.len(),
350                new_end_byte: change.text.len(),
351                new_text: change.text.clone(),
352            })
353        }
354    }
355}
356
/// Result of incremental reparse
#[derive(Debug)]
pub struct ReparseResult {
    /// Byte ranges (in the new text) that were re-analysed.
    pub changed_ranges: Vec<Range<usize>>,
    /// Diagnostics produced during the reparse (currently always empty;
    /// diagnostics are handled by the LSP layer).
    pub diagnostics: Vec<Diagnostic>,
    /// Total number of bytes that were re-lexed/re-parsed.
    pub reparsed_bytes: usize,
}
364
365/// Apply edits incrementally
366pub fn apply_edits(state: &mut IncrementalState, edits: &[Edit]) -> Result<ReparseResult> {
367    // Handle multiple edits by sorting and applying in reverse order
368    let mut sorted_edits = edits.to_vec();
369    sorted_edits.sort_by_key(|e| e.start_byte);
370    sorted_edits.reverse(); // Apply from end to start to avoid offset shifts
371
372    // Check if we should fall back to full reparse
373    let total_changed = sorted_edits.iter().map(|e| e.new_text.len()).sum::<usize>();
374
375    // Fallback thresholds
376    const MAX_EDIT_SIZE: usize = 64 * 1024; // 64KB
377
378    if total_changed > MAX_EDIT_SIZE {
379        return full_reparse(state);
380    }
381
382    // For MVP, handle single edit with incremental logic
383    if sorted_edits.len() == 1 {
384        let edit = &sorted_edits[0];
385
386        // Heuristic: if edit is large (>1KB) or crosses many lines, do full reparse
387        if edit.new_text.len() > 1024 || edit.new_text.matches('\n').count() > 10 {
388            apply_single_edit(state, edit)?;
389            return full_reparse(state);
390        }
391
392        // Apply the edit with incremental lexing
393        let reparsed_range = apply_single_edit(state, edit)?;
394        let reparsed_bytes = reparsed_range.end - reparsed_range.start;
395
396        // If reparsed too much (>20% of doc), might need full parse in future
397        // But for now, trust the incremental result
398
399        Ok(ReparseResult {
400            changed_ranges: vec![reparsed_range],
401            diagnostics: vec![],
402            reparsed_bytes,
403        })
404    } else {
405        // Multiple edits - apply them in sequence
406        for edit in sorted_edits {
407            apply_single_edit(state, &edit)?;
408        }
409        full_reparse(state)
410    }
411}
412
/// Apply a single edit to the state
///
/// Splices `edit.new_text` into `state.source`, then re-lexes from the
/// nearest checkpoint before the edit. Re-lexing stops early once a freshly
/// lexed token matches an old token shifted by the edit's byte delta, at
/// which point the remaining old tokens are reused with adjusted offsets.
/// Returns the byte range that was actually re-lexed.
///
/// NOTE(review): the AST and parse checkpoints are NOT updated here — only
/// source, rope, line index, tokens and lex checkpoints.
fn apply_single_edit(state: &mut IncrementalState, edit: &Edit) -> Result<Range<usize>> {
    // Find checkpoint before edit to resume lexing
    let checkpoint = state
        .find_lex_checkpoint(edit.start_byte)
        .copied()
        .ok_or_else(|| anyhow::anyhow!("No checkpoint found"))?;

    // Apply text edit with safe boundary checking
    // (clamped to the current source length to avoid slicing past the end)
    let old_end_byte = edit.old_end_byte.min(state.source.len());
    let start_byte = edit.start_byte.min(state.source.len());

    // Compute the byte shift caused by this edit
    let byte_shift: isize = edit.new_text.len() as isize - (old_end_byte - start_byte) as isize;

    // NOTE(review): these `&str` byte slices panic if the edit offsets fall
    // inside a multi-byte UTF-8 character — confirm callers guarantee
    // char-boundary-aligned offsets.
    let mut new_source = String::with_capacity(
        state.source.len() - (old_end_byte - start_byte) + edit.new_text.len(),
    );
    new_source.push_str(&state.source[..start_byte]);
    new_source.push_str(&edit.new_text);
    new_source.push_str(&state.source[old_end_byte..]);
    state.source = new_source;
    // NOTE(review): rope and line index are rebuilt from scratch here rather
    // than patched incrementally — O(doc) per edit; acceptable for MVP.
    state.rope = Rope::from_str(&state.source);
    state.line_index = LineIndex::new(&state.source);

    // Start lexing from checkpoint
    use perl_lexer::{Checkpointable, LexerCheckpoint, Position};
    let mut lexer = PerlLexer::new(&state.source);
    let mut lex_cp = LexerCheckpoint::new();
    lex_cp.position = checkpoint.byte;
    lex_cp.mode = checkpoint.mode;
    // Lexer positions are 1-based; our checkpoints store 0-based line/column.
    lex_cp.current_pos = Position {
        byte: checkpoint.byte,
        line: (checkpoint.line + 1) as u32,
        column: (checkpoint.column + 1) as u32,
    };
    lexer.restore(&lex_cp);

    // Determine token index to splice: everything from the checkpoint
    // onwards will be replaced by freshly lexed tokens.
    let start_idx =
        state.tokens.iter().position(|t| t.start >= checkpoint.byte).unwrap_or(state.tokens.len());

    // Build a lookup of old tokens past the edit for synchronisation.
    // Once a newly-lexed token matches an old token (shifted by the byte delta),
    // we can stop re-lexing and reuse the remaining old tokens with adjusted positions.
    let edit_end_in_new = start_byte + edit.new_text.len();

    // Find the first old-token index whose start >= old_end_byte (i.e. fully past the edit)
    let old_sync_start =
        state.tokens.iter().position(|t| t.start >= old_end_byte).unwrap_or(state.tokens.len());

    let mut new_tokens = Vec::new();
    let mut last_token_end = checkpoint.byte;
    let mut synced = false;
    let mut sync_old_idx = state.tokens.len();
    loop {
        match lexer.next_token() {
            Some(token) => {
                if token.token_type == TokenType::EOF {
                    break;
                }
                last_token_end = token.end;

                // Once we are past the edit region, check if the new token matches
                // an old token (shifted by byte_shift). If so, we can stop re-lexing
                // and reuse the rest of the old tokens with position adjustment.
                if token.start >= edit_end_in_new {
                    let mut found_sync = false;
                    for (idx_offset, old_tok) in state.tokens[old_sync_start..].iter().enumerate() {
                        let shifted_start = (old_tok.start as isize + byte_shift) as usize;
                        let shifted_end = (old_tok.end as isize + byte_shift) as usize;
                        if token.start == shifted_start
                            && token.end == shifted_end
                            && token.token_type == old_tok.token_type
                        {
                            // Token synchronised -- reuse the rest
                            found_sync = true;
                            sync_old_idx = old_sync_start + idx_offset + 1;
                            break;
                        }
                    }
                    // Push the token regardless (it's valid either way)
                    new_tokens.push(token);
                    if found_sync {
                        synced = true;
                        break;
                    }
                } else {
                    new_tokens.push(token);
                }
            }
            None => break,
        }
    }

    // If we synchronised, append remaining old tokens with shifted positions
    if synced {
        for old_tok in &state.tokens[sync_old_idx..] {
            let mut adjusted = old_tok.clone();
            adjusted.start = (adjusted.start as isize + byte_shift) as usize;
            adjusted.end = (adjusted.end as isize + byte_shift) as usize;
            last_token_end = adjusted.end;
            new_tokens.push(adjusted);
        }
    }

    state.tokens.splice(start_idx.., new_tokens);

    // Rebuild checkpoints with updated line index
    state.lex_checkpoints =
        IncrementalState::create_lex_checkpoints(&state.tokens, &state.line_index);

    // Return the actual reparsed range (from checkpoint to end of last new token)
    Ok(checkpoint.byte..last_token_end)
}
528
#[cfg(test)]
mod tests {
    use super::*;

    /// Verify that a small ranged edit re-lexes substantially fewer bytes than
    /// the total document length, confirming the checkpoint fast-path fires.
    #[test]
    fn test_incremental_state_small_edit_uses_checkpoint() -> Result<()> {
        // Build a document large enough that a checkpoint will exist before the edit.
        // We need at least one statement-boundary token (semicolon / brace) to create
        // a checkpoint entry that sits before the edit site.
        let mut lines = Vec::with_capacity(30);
        for i in 0..30usize {
            lines.push(format!("my $var_{i} = {i};"));
        }
        let source = lines.join("\n");
        let doc_len = source.len();

        let mut state = IncrementalState::new(source.clone());

        // Sanity: we should have at least one lex checkpoint from all those semicolons.
        assert!(
            state.lex_checkpoints.len() > 1,
            "expected multiple lex checkpoints, got {}",
            state.lex_checkpoints.len()
        );

        // Edit the LAST line: change `my $var_29 = 29;` -> `my $var_29 = 999;`
        // The checkpoint at some semicolon earlier in the doc should be used, meaning
        // we re-lex far less than the full document.
        let edit_text = "999";
        // rfind targets the final "29;" occurrence (the last line's value).
        let edit_start = source.rfind("29;").unwrap_or(source.len() - 3);
        let edit_end = edit_start + 2; // replace "29"

        let edit = Edit {
            start_byte: edit_start,
            old_end_byte: edit_end,
            new_end_byte: edit_start + edit_text.len(),
            new_text: edit_text.to_string(),
        };

        let result = apply_edits(&mut state, &[edit])?;

        // The key assertion: reparsed_bytes must be strictly less than the full
        // document size, proving the incremental path was taken.
        assert!(
            result.reparsed_bytes < doc_len,
            "incremental reparse should cover less than the full document ({} bytes reparsed, {} doc len)",
            result.reparsed_bytes,
            doc_len
        );

        // Source must reflect the edit.
        assert!(state.source.contains("999"), "source must contain the new value after edit");
        assert!(
            !state.source.contains("= 29;"),
            "source must not contain the old value after edit"
        );

        Ok(())
    }

    /// Verify that the full-reparse fallback is triggered for an edit >64KB.
    #[test]
    fn test_incremental_state_large_edit_falls_back_to_full_reparse() -> Result<()> {
        let source = "my $x = 1;\n".repeat(10);
        let mut state = IncrementalState::new(source.clone());

        // A new_text larger than 64KB must trigger full reparse
        let big_text = "x".repeat(65 * 1024);
        // Pure insertion at byte 0 (old_end_byte == start_byte).
        let edit = Edit {
            start_byte: 0,
            old_end_byte: 0,
            new_end_byte: big_text.len(),
            new_text: big_text.clone(),
        };

        let result = apply_edits(&mut state, &[edit])?;

        // Full reparse: reparsed_bytes == full (new) source length
        assert_eq!(
            result.reparsed_bytes,
            state.source.len(),
            "large edit must trigger full reparse, reparsed_bytes should equal doc length"
        );

        Ok(())
    }
}
618
619/// Full document reparse fallback
620fn full_reparse(state: &mut IncrementalState) -> Result<ReparseResult> {
621    let mut parser = Parser::new(&state.source);
622    state.ast = match parser.parse() {
623        Ok(ast) => ast,
624        Err(e) => Node::new(
625            NodeKind::Error {
626                message: e.to_string(),
627                expected: vec![],
628                found: None,
629                partial: None,
630            },
631            SourceLocation { start: 0, end: state.source.len() },
632        ),
633    };
634
635    // Re-lex to get tokens
636    let mut lexer = PerlLexer::new(&state.source);
637    let mut tokens = Vec::new();
638    loop {
639        match lexer.next_token() {
640            Some(token) => {
641                if token.token_type == TokenType::EOF {
642                    break;
643                }
644                tokens.push(token);
645            }
646            None => break,
647        }
648    }
649    state.tokens = tokens;
650
651    state.rope = Rope::from_str(&state.source);
652    state.line_index = LineIndex::new(&state.source);
653
654    // Rebuild checkpoints
655    state.lex_checkpoints =
656        IncrementalState::create_lex_checkpoints(&state.tokens, &state.line_index);
657    state.parse_checkpoints = IncrementalState::create_parse_checkpoints(&state.ast);
658
659    // No diagnostics for now, will be handled by the LSP server
660    let diagnostics = vec![];
661
662    Ok(ReparseResult {
663        changed_ranges: vec![0..state.source.len()],
664        diagnostics,
665        reparsed_bytes: state.source.len(),
666    })
667}