Skip to main content

perl_parser/incremental/
incremental_document.rs

1//! High-performance incremental document parsing with subtree reuse
2//!
3//! This module provides incremental parsing with subtree reuse.  Single-edit
4//! fast-path updates target <1ms; batch edits (`apply_edits`) always perform
5//! a fresh parse for correctness validation, so batch latency scales with
6//! document size rather than edit size.
7
8use super::incremental_edit::{IncrementalEdit, IncrementalEditSet};
9use perl_parser_core::{
10    ast::{Node, NodeKind},
11    error::ParseResult,
12    parser::Parser,
13};
14use std::collections::{HashMap, VecDeque};
15use std::sync::Arc;
16use std::time::Instant;
17use tracing::debug;
18
/// A document that supports incremental reparsing with subtree reuse.
///
/// Holds the current parse tree alongside the source text it was produced
/// from, plus a cache of previously parsed subtrees that later edits can
/// graft back in instead of reparsing them.
#[derive(Debug, Clone)]
pub struct IncrementalDocument {
    /// Current parsed tree (shared so cached subtrees can alias it cheaply)
    pub root: Arc<Node>,
    /// Source text the tree was parsed from; kept in sync by `apply_edit(s)`
    pub source: String,
    /// Version number for tracking changes; bumped once per edit call
    pub version: u64,
    /// Cache of reusable subtrees from previous parses
    pub subtree_cache: SubtreeCache,
    /// Performance metrics for the most recent parse cycle
    pub metrics: ParseMetrics,
}
33
/// Cache for reusable subtrees
#[derive(Debug, Clone, Default)]
pub struct SubtreeCache {
    /// Maps content hash to subtree for content-based reuse
    pub by_content: HashMap<u64, Arc<Node>>,
    /// Maps `(start, end)` byte range to subtree for position-based reuse
    pub by_range: HashMap<(usize, usize), Arc<Node>>,
    /// LRU queue of content hashes, used to pick eviction victims
    pub lru: VecDeque<u64>,
    /// Critical symbols that should be preserved longer during eviction
    pub critical_symbols: HashMap<u64, SymbolPriority>,
    /// Maximum number of cached entries before eviction kicks in
    pub max_size: usize,
}
48
/// Priority levels for symbols in cache eviction.
///
/// Derives `Ord` over the explicit discriminants, so
/// `Low < Medium < High < Critical`; higher-priority entries are intended
/// to survive eviction longer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum SymbolPriority {
    Low = 0,
    Medium = 1,
    High = 2,
    Critical = 3,
}
57
/// Performance metrics for incremental parsing
///
/// Reset to defaults at the start of each `apply_edit` / `apply_edits` cycle.
#[derive(Debug, Clone, Default)]
pub struct ParseMetrics {
    /// Wall-clock duration of the last parse cycle, in milliseconds
    pub last_parse_time_ms: f64,
    /// Total nodes taken from cached subtrees instead of being reparsed
    pub nodes_reused: usize,
    /// Nodes that had to be produced by reparsing
    pub nodes_reparsed: usize,
    /// Cached subtrees that were eligible for reuse this cycle
    pub cache_hits: usize,
    /// Cached subtrees invalidated by the edit(s) this cycle
    pub cache_misses: usize,
}
67
68impl IncrementalDocument {
69    /// Create a new incremental document
70    pub fn new(source: String) -> ParseResult<Self> {
71        let start = Instant::now();
72        let mut parser = Parser::new(&source);
73        let root = parser.parse()?;
74
75        let mut doc = IncrementalDocument {
76            root: Arc::new(root),
77            source,
78            version: 0,
79            subtree_cache: SubtreeCache::new(1000),
80            metrics: ParseMetrics::default(),
81        };
82
83        doc.metrics.last_parse_time_ms = start.elapsed().as_secs_f64() * 1000.0;
84        doc.cache_subtrees();
85
86        Ok(doc)
87    }
88
    /// Apply an edit and incrementally reparse
    ///
    /// Bumps the version, patches the source text, then reparses. Small
    /// single-token edits take a fast path that rewrites one token node in a
    /// cloned tree; everything else goes through `parse_with_reuse`.
    pub fn apply_edit(&mut self, edit: IncrementalEdit) -> ParseResult<()> {
        let start = Instant::now();
        self.version += 1;

        // Reset metrics for this parse cycle
        self.metrics = ParseMetrics::default();

        // Apply the edit to the source (returns the source unchanged if the
        // edit range cannot be mapped onto it)
        let new_source = self.apply_edit_to_source(&edit);

        // Find affected subtrees
        let affected_range = (edit.start_byte, edit.old_end_byte);
        let reusable_subtrees = self.find_reusable_subtrees(affected_range, &edit);

        // Incrementally parse with subtree reuse
        let (new_root, used_fast_path) =
            self.incremental_parse(&new_source, &edit, reusable_subtrees)?;

        // Update state
        self.source = new_source;
        self.root = Arc::new(new_root);

        // Only rebuild the full subtree cache when the fast path was NOT used.
        // The fast path only mutates a single token node in-place, so the
        // existing cache entries (keyed by byte range) are mostly still valid,
        // whereas a full cache rebuild walks the entire tree and is O(n).
        // NOTE(review): after a size-changing fast-path edit, cache entries
        // positioned after the edit keep their pre-edit byte ranges — confirm
        // the range checks in `find_reusable_subtrees` tolerate that staleness.
        if !used_fast_path {
            self.cache_subtrees();
        }

        self.metrics.last_parse_time_ms = start.elapsed().as_secs_f64() * 1000.0;

        Ok(())
    }
124
    /// Apply multiple edits in a batch
    ///
    /// Normalizes the edit set against the current source, applies every edit
    /// to a scratch copy of the text, then parses the result. When cached
    /// subtrees are grafted in, the result is validated against a fresh parse
    /// (see module docs: batch latency scales with document size). Any edit
    /// that cannot be normalized or mapped falls back to
    /// `fallback_parse_batch`.
    pub fn apply_edits(&mut self, edits: &IncrementalEditSet) -> ParseResult<()> {
        let start = Instant::now();
        self.version += 1;

        // Reset metrics for this batch of edits
        self.metrics = ParseMetrics::default();

        let Some(sorted_edits) = edits.normalize_for_source(&self.source) else {
            return self.fallback_parse_batch(edits, start);
        };

        // Apply all edits to the source in-place, avoiding a full-string
        // allocation per edit. Edits are applied in the order produced by
        // `normalize_for_source` — presumably back-to-front so earlier byte
        // offsets stay valid; TODO confirm against its implementation.
        let mut new_source = self.source.clone();
        for edit in &sorted_edits {
            if !self.apply_edit_in_place(&mut new_source, edit) {
                return self.fallback_parse_batch(edits, start);
            }
        }

        // Find all affected ranges (in pre-edit coordinates, matching the
        // byte ranges the cache was built with)
        let affected_ranges: Vec<_> =
            sorted_edits.iter().map(|e| (e.start_byte, e.old_end_byte)).collect();

        // Collect reusable subtrees outside affected ranges
        let reusable = self.find_reusable_for_ranges(&affected_ranges);

        // Parse with reuse when possible. When reuse was attempted, validate
        // the grafted tree against a fresh parse to catch divergence. When no
        // reuse candidates exist, skip the second parse entirely.
        let new_root = if !reusable.is_empty() {
            let reused_root = self.parse_with_reuse(&new_source, reusable)?;
            // parse_with_reuse already ran a full parse internally; run a second
            // one only to verify the graft produced a consistent tree.
            let mut parser = Parser::new(&new_source);
            let fresh_root = parser.parse()?;
            if Self::nodes_match(&reused_root, &fresh_root) {
                reused_root
            } else {
                debug!("Batch incremental reuse diverged from fresh parse; using fallback result");
                fresh_root
            }
        } else {
            let mut parser = Parser::new(&new_source);
            parser.parse()?
        };

        // Update state
        self.source = new_source;
        self.root = Arc::new(new_root);
        self.cache_subtrees();

        self.metrics.last_parse_time_ms = start.elapsed().as_secs_f64() * 1000.0;

        Ok(())
    }
183
184    fn fallback_parse_batch(
185        &mut self,
186        edits: &IncrementalEditSet,
187        start: Instant,
188    ) -> ParseResult<()> {
189        let new_source = edits.apply_to_string(&self.source);
190        let mut parser = Parser::new(&new_source);
191        let new_root = parser.parse()?;
192        self.source = new_source;
193        self.root = Arc::new(new_root);
194        self.cache_subtrees();
195        self.metrics.last_parse_time_ms = start.elapsed().as_secs_f64() * 1000.0;
196        Ok(())
197    }
198
    /// Apply edit to source string
    ///
    /// Thin wrapper over `apply_edit_to_string` using the document's own
    /// source text.
    fn apply_edit_to_source(&self, edit: &IncrementalEdit) -> String {
        self.apply_edit_to_string(&self.source, edit)
    }
203
204    fn apply_edit_to_string(&self, source: &str, edit: &IncrementalEdit) -> String {
205        let Some((start, end)) = Self::map_edit_range(source, edit) else {
206            debug!(
207                "Invalid edit mapping: start={}, end={}, source_len={}",
208                edit.start_byte,
209                edit.old_end_byte,
210                source.len()
211            );
212            return source.to_string();
213        };
214
215        let mut result = String::with_capacity(source.len() + edit.new_text.len());
216        result.push_str(&source[..start]);
217        result.push_str(&edit.new_text);
218        result.push_str(&source[end..]);
219        result
220    }
221
222    fn apply_edit_in_place(&self, source: &mut String, edit: &IncrementalEdit) -> bool {
223        let Some((start, end)) = Self::map_edit_range(source, edit) else {
224            debug!(
225                "Invalid edit mapping: start={}, end={}, source_len={}",
226                edit.start_byte,
227                edit.old_end_byte,
228                source.len()
229            );
230            return false;
231        };
232
233        source.replace_range(start..end, &edit.new_text);
234        true
235    }
236
237    fn map_edit_range(source: &str, edit: &IncrementalEdit) -> Option<(usize, usize)> {
238        if edit.start_byte > edit.old_end_byte {
239            return None;
240        }
241        if edit.old_end_byte > source.len() {
242            return None;
243        }
244        if !source.is_char_boundary(edit.start_byte) || !source.is_char_boundary(edit.old_end_byte)
245        {
246            return None;
247        }
248        Some((edit.start_byte, edit.old_end_byte))
249    }
250
    /// Structural equality check used to validate a grafted tree against a
    /// fresh parse (delegates to `Node`'s `PartialEq`).
    fn nodes_match(left: &Node, right: &Node) -> bool {
        left == right
    }
254
    /// Find subtrees that can be reused (outside the edited range)
    ///
    /// Cached subtrees entirely before the edit are reused as-is; subtrees
    /// entirely after it are shifted by the edit's byte delta. Anything
    /// overlapping the edited range counts as a cache miss. Updates
    /// `self.metrics` as a side effect.
    fn find_reusable_subtrees(
        &mut self,
        affected_range: (usize, usize),
        edit: &IncrementalEdit,
    ) -> Vec<Arc<Node>> {
        let mut reusable = Vec::new();
        let delta = edit.byte_shift();

        // Collect subtrees before the edit (unchanged positions)
        for ((start, end), node) in &self.subtree_cache.by_range {
            if *end <= affected_range.0 {
                // Subtree entirely before edit - can reuse as-is
                reusable.push(node.clone());
                self.metrics.cache_hits += 1;
                self.metrics.nodes_reused += self.count_nodes(node);
            } else if *start >= affected_range.1 {
                // Subtree entirely after edit - needs position adjustment.
                // Silently dropped (neither hit nor miss) when the shift
                // would produce a negative position.
                if let Some(adjusted) = self.adjust_node_position(node, delta) {
                    reusable.push(Arc::new(adjusted));
                    self.metrics.cache_hits += 1;
                    self.metrics.nodes_reused += self.count_nodes(node);
                }
            } else {
                // Overlaps the edit: invalidated
                self.metrics.cache_misses += 1;
            }
        }

        reusable
    }
285
286    /// Find reusable subtrees for multiple affected ranges
287    fn find_reusable_for_ranges(&mut self, ranges: &[(usize, usize)]) -> Vec<Arc<Node>> {
288        let mut reusable = Vec::new();
289
290        for ((start, end), node) in &self.subtree_cache.by_range {
291            let affected = ranges.iter().any(|(r_start, r_end)| {
292                // Check if this subtree overlaps with any affected range
293                *start < *r_end && *end > *r_start
294            });
295
296            if !affected {
297                reusable.push(node.clone());
298                self.metrics.cache_hits += 1;
299                self.metrics.nodes_reused += self.count_nodes(node);
300            } else {
301                self.metrics.cache_misses += 1;
302            }
303        }
304
305        reusable
306    }
307
308    /// Incrementally parse with subtree reuse.
309    ///
310    /// Returns `(new_root, used_fast_path)` -- the boolean indicates whether
311    /// the fast token-update path was used so the caller can skip a full
312    /// cache rebuild.
313    fn incremental_parse(
314        &mut self,
315        source: &str,
316        edit: &IncrementalEdit,
317        _reusable: Vec<Arc<Node>>,
318    ) -> ParseResult<(Node, bool)> {
319        // For small edits within a single token, try fast path
320        if self.is_single_token_edit(edit) {
321            if let Some(node) = self.fast_token_update(source, edit) {
322                self.metrics.nodes_reparsed = 1;
323                return Ok((node, true));
324            }
325        }
326
327        // Otherwise use partial parsing with reuse
328        let node = self.parse_with_reuse(source, _reusable)?;
329        Ok((node, false))
330    }
331
332    /// Check if edit affects only a single token
333    fn is_single_token_edit(&self, edit: &IncrementalEdit) -> bool {
334        // Check if edit is small and contained within a single literal
335        if edit.old_end_byte - edit.start_byte > 100 {
336            return false; // Too large
337        }
338
339        // Find the containing node
340        if let Some(node) = self.find_node_at_position(edit.start_byte) {
341            matches!(
342                node.kind,
343                NodeKind::Number { .. } | NodeKind::String { .. } | NodeKind::Identifier { .. }
344            )
345        } else {
346            false
347        }
348    }
349
350    /// Fast path for single token updates
351    fn fast_token_update(&self, source: &str, edit: &IncrementalEdit) -> Option<Node> {
352        // Clone the tree and update just the affected token
353        let mut new_root = (*self.root).clone();
354
355        // Find and update the affected token
356        if self.update_token_in_tree(&mut new_root, source, edit) { Some(new_root) } else { None }
357    }
358
    /// Update a single token in the tree
    ///
    /// Descends to the node whose byte range contains the edit; if it is a
    /// `Number`, `String`, or `Identifier` token, shifts its end position by
    /// the edit's byte delta and re-reads its text from `source`. Returns
    /// `false` when nothing was updated (the caller then discards the cloned
    /// tree and falls back to a full reparse, so partial mutations on the
    /// failure path are harmless).
    ///
    /// NOTE(review): only the matched token's `location.end` is adjusted —
    /// sibling nodes after the edit keep their pre-edit positions until the
    /// next full reparse; confirm consumers tolerate this.
    fn update_token_in_tree(&self, node: &mut Node, source: &str, edit: &IncrementalEdit) -> bool {
        // Check if this node contains the edit
        if node.location.start <= edit.start_byte && node.location.end >= edit.old_end_byte {
            match &mut node.kind {
                NodeKind::Number { .. } => {
                    // Re-parse just this number
                    let delta = edit.byte_shift();
                    let new_end = (node.location.end as isize + delta).max(0) as usize;

                    // Safely extract new text with bounds checking
                    if new_end <= source.len()
                        && source.is_char_boundary(node.location.start)
                        && source.is_char_boundary(new_end)
                    {
                        node.location.end = new_end;
                        let new_text = &source[node.location.start..node.location.end];
                        // Round-trips through f64, so the stored text is the
                        // normalized form (e.g. "1.50" -> "1.5") — presumably
                        // acceptable; TODO confirm callers don't rely on the
                        // literal's exact spelling.
                        if let Ok(value) = new_text.parse::<f64>() {
                            node.kind = NodeKind::Number { value: value.to_string() };
                            return true;
                        }
                    }
                }
                NodeKind::String { value, .. } => {
                    // Update string content with the raw text of the new range
                    let delta = edit.byte_shift();
                    let new_end = (node.location.end as isize + delta).max(0) as usize;

                    // Safely extract new string value with bounds checking
                    if new_end <= source.len()
                        && source.is_char_boundary(node.location.start)
                        && source.is_char_boundary(new_end)
                    {
                        node.location.end = new_end;
                        let new_text = &source[node.location.start..node.location.end];
                        *value = new_text.to_string();
                        return true;
                    }
                }
                NodeKind::Identifier { name } => {
                    // Update identifier name with the raw text of the new range
                    let delta = edit.byte_shift();
                    let new_end = (node.location.end as isize + delta).max(0) as usize;

                    // Safely extract identifier name with bounds checking
                    if new_end <= source.len()
                        && source.is_char_boundary(node.location.start)
                        && source.is_char_boundary(new_end)
                    {
                        node.location.end = new_end;
                        *name = source[node.location.start..node.location.end].to_string();
                        return true;
                    }
                }
                _ => {
                    // Not a token node: recursively search children
                    return self.update_token_in_children(node, source, edit);
                }
            }
        }

        false
    }
422
    /// Update token in child nodes
    ///
    /// Dispatches `update_token_in_tree` into each child of the supported
    /// container kinds, stopping at the first successful update. Kinds not
    /// listed here fall through to `_ => {}` and return `false`, which sends
    /// the caller down the full-reparse path.
    ///
    /// NOTE(review): `For` loops are descended into by `insert_reusable` and
    /// `adjust_node_position` but not here — tokens inside a `for` header or
    /// body never take the fast path; confirm whether that is intentional.
    fn update_token_in_children(
        &self,
        node: &mut Node,
        source: &str,
        edit: &IncrementalEdit,
    ) -> bool {
        match &mut node.kind {
            NodeKind::Program { statements } | NodeKind::Block { statements } => {
                for stmt in statements {
                    if self.update_token_in_tree(stmt, source, edit) {
                        return true;
                    }
                }
            }
            NodeKind::Binary { left, right, .. }
            | NodeKind::Assignment { lhs: left, rhs: right, .. } => {
                if self.update_token_in_tree(left, source, edit) {
                    return true;
                }
                if self.update_token_in_tree(right, source, edit) {
                    return true;
                }
            }
            NodeKind::Subroutine { body, .. } => {
                if self.update_token_in_tree(body, source, edit) {
                    return true;
                }
            }
            NodeKind::ExpressionStatement { expression } => {
                if self.update_token_in_tree(expression, source, edit) {
                    return true;
                }
            }
            NodeKind::VariableDeclaration { variable, initializer, .. } => {
                if self.update_token_in_tree(variable, source, edit) {
                    return true;
                }
                if let Some(init) = initializer {
                    if self.update_token_in_tree(init, source, edit) {
                        return true;
                    }
                }
            }
            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
                if self.update_token_in_tree(condition, source, edit) {
                    return true;
                }
                if self.update_token_in_tree(then_branch, source, edit) {
                    return true;
                }
                for (cond, branch) in elsif_branches {
                    if self.update_token_in_tree(cond, source, edit) {
                        return true;
                    }
                    if self.update_token_in_tree(branch, source, edit) {
                        return true;
                    }
                }
                if let Some(else_b) = else_branch {
                    if self.update_token_in_tree(else_b, source, edit) {
                        return true;
                    }
                }
            }
            NodeKind::While { condition, body, .. } => {
                if self.update_token_in_tree(condition, source, edit) {
                    return true;
                }
                if self.update_token_in_tree(body, source, edit) {
                    return true;
                }
            }
            NodeKind::FunctionCall { args, .. } => {
                for arg in args {
                    if self.update_token_in_tree(arg, source, edit) {
                        return true;
                    }
                }
            }
            NodeKind::Unary { operand, .. } => {
                if self.update_token_in_tree(operand, source, edit) {
                    return true;
                }
            }
            _ => {}
        }

        false
    }
513
514    /// Parse with reusable subtrees
515    fn parse_with_reuse(&mut self, source: &str, reusable: Vec<Arc<Node>>) -> ParseResult<Node> {
516        // Start with a fresh parse of the new source
517        let mut parser = Parser::new(source);
518        let mut root = parser.parse()?;
519
520        // Try to splice in any reusable subtrees by matching on byte ranges
521        for node in reusable {
522            self.insert_reusable(&mut root, &node);
523        }
524
525        // Update metrics based on reused nodes
526        self.metrics.nodes_reparsed =
527            self.count_nodes(&root).saturating_sub(self.metrics.nodes_reused);
528
529        Ok(root)
530    }
531
    /// Replace matching nodes in `target` with a reusable subtree
    ///
    /// A node matches when its byte range and its `NodeKind` discriminant
    /// both equal the reusable subtree's. The first match found (pre-order)
    /// is overwritten with a clone of the cached subtree and the walk stops;
    /// returns `true` if a graft happened. Kinds not listed in the match
    /// (e.g. `FunctionCall`, `Unary`) are not descended into, so grafts
    /// below them are conservatively skipped.
    fn insert_reusable(&self, target: &mut Node, reusable: &Arc<Node>) -> bool {
        if target.location.start == reusable.location.start
            && target.location.end == reusable.location.end
            && std::mem::discriminant(&target.kind) == std::mem::discriminant(&reusable.kind)
        {
            *target = (**reusable).clone();
            return true;
        }

        match &mut target.kind {
            NodeKind::Program { statements } | NodeKind::Block { statements } => {
                for stmt in statements {
                    if self.insert_reusable(stmt, reusable) {
                        return true;
                    }
                }
            }
            NodeKind::Binary { left, right, .. } => {
                if self.insert_reusable(left, reusable) {
                    return true;
                }
                if self.insert_reusable(right, reusable) {
                    return true;
                }
            }
            NodeKind::Subroutine { body, .. } => {
                if self.insert_reusable(body, reusable) {
                    return true;
                }
            }
            NodeKind::ExpressionStatement { expression } => {
                if self.insert_reusable(expression, reusable) {
                    return true;
                }
            }
            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
                if self.insert_reusable(condition, reusable) {
                    return true;
                }
                if self.insert_reusable(then_branch, reusable) {
                    return true;
                }
                for (cond, branch) in elsif_branches {
                    if self.insert_reusable(cond, reusable) {
                        return true;
                    }
                    if self.insert_reusable(branch, reusable) {
                        return true;
                    }
                }
                if let Some(else_b) = else_branch {
                    if self.insert_reusable(else_b, reusable) {
                        return true;
                    }
                }
            }
            NodeKind::While { condition, body, .. } => {
                if self.insert_reusable(condition, reusable) {
                    return true;
                }
                if self.insert_reusable(body, reusable) {
                    return true;
                }
            }
            NodeKind::For { init, condition, update, body, .. } => {
                if let Some(i) = init {
                    if self.insert_reusable(i, reusable) {
                        return true;
                    }
                }
                if let Some(c) = condition {
                    if self.insert_reusable(c, reusable) {
                        return true;
                    }
                }
                if let Some(u) = update {
                    if self.insert_reusable(u, reusable) {
                        return true;
                    }
                }
                if self.insert_reusable(body, reusable) {
                    return true;
                }
            }
            NodeKind::VariableDeclaration { variable, initializer, .. } => {
                if self.insert_reusable(variable, reusable) {
                    return true;
                }
                if let Some(init) = initializer {
                    if self.insert_reusable(init, reusable) {
                        return true;
                    }
                }
            }
            NodeKind::Assignment { lhs, rhs, .. } => {
                if self.insert_reusable(lhs, reusable) {
                    return true;
                }
                if self.insert_reusable(rhs, reusable) {
                    return true;
                }
            }
            _ => {}
        }

        false
    }
640
641    /// Adjust node positions after an edit
642    ///
643    /// Returns `None` if the adjustment would produce an invalid (negative) position,
644    /// which can happen when a deletion shrinks the source and a node's original
645    /// position is inside the deleted range.
646    fn adjust_node_position(&self, node: &Node, delta: isize) -> Option<Node> {
647        let mut adjusted = node.clone();
648
649        // Checked conversion to prevent underflow when delta is negative
650        let new_start = adjusted.location.start as isize + delta;
651        let new_end = adjusted.location.end as isize + delta;
652        if new_start < 0 || new_end < 0 {
653            return None;
654        }
655        adjusted.location.start = new_start as usize;
656        adjusted.location.end = new_end as usize;
657
658        // Recursively adjust children
659        match &mut adjusted.kind {
660            NodeKind::Program { statements } | NodeKind::Block { statements } => {
661                for stmt in statements {
662                    *stmt = self.adjust_node_position(stmt, delta)?;
663                }
664            }
665            NodeKind::Binary { left, right, .. } => {
666                **left = self.adjust_node_position(left, delta)?;
667                **right = self.adjust_node_position(right, delta)?;
668            }
669            NodeKind::Subroutine { body, .. } => {
670                **body = self.adjust_node_position(body, delta)?;
671            }
672            NodeKind::ExpressionStatement { expression } => {
673                **expression = self.adjust_node_position(expression, delta)?;
674            }
675            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
676                **condition = self.adjust_node_position(condition, delta)?;
677                **then_branch = self.adjust_node_position(then_branch, delta)?;
678                for (cond, branch) in elsif_branches {
679                    **cond = self.adjust_node_position(cond, delta)?;
680                    **branch = self.adjust_node_position(branch, delta)?;
681                }
682                if let Some(else_b) = else_branch {
683                    **else_b = self.adjust_node_position(else_b, delta)?;
684                }
685            }
686            NodeKind::While { condition, body, .. } => {
687                **condition = self.adjust_node_position(condition, delta)?;
688                **body = self.adjust_node_position(body, delta)?;
689            }
690            NodeKind::For { init, condition, update, body, .. } => {
691                if let Some(i) = init {
692                    **i = self.adjust_node_position(i, delta)?;
693                }
694                if let Some(c) = condition {
695                    **c = self.adjust_node_position(c, delta)?;
696                }
697                if let Some(u) = update {
698                    **u = self.adjust_node_position(u, delta)?;
699                }
700                **body = self.adjust_node_position(body, delta)?;
701            }
702            NodeKind::VariableDeclaration { variable, initializer, .. } => {
703                **variable = self.adjust_node_position(variable, delta)?;
704                if let Some(init) = initializer {
705                    **init = self.adjust_node_position(init, delta)?;
706                }
707            }
708            NodeKind::Assignment { lhs, rhs, .. } => {
709                **lhs = self.adjust_node_position(lhs, delta)?;
710                **rhs = self.adjust_node_position(rhs, delta)?;
711            }
712            _ => {}
713        }
714
715        Some(adjusted)
716    }
717
    /// Find node at a specific byte position
    ///
    /// Returns the deepest node (per `find_in_node`'s descent rules) whose
    /// half-open range `[start, end)` contains `pos`, or `None` if `pos` is
    /// outside the root's range.
    fn find_node_at_position(&self, pos: usize) -> Option<&Node> {
        self.find_in_node(&self.root, pos)
    }
722
    /// Recursive worker for `find_node_at_position`.
    ///
    /// Uses half-open containment (`start <= pos < end`) and prefers the
    /// deepest matching descendant; when no listed child kind contains `pos`,
    /// the current node itself is returned. Kinds not listed in the match
    /// are not descended into, so their inner nodes are never reported.
    fn find_in_node<'a>(&self, node: &'a Node, pos: usize) -> Option<&'a Node> {
        if node.location.start <= pos && node.location.end > pos {
            // Check children for more specific match
            match &node.kind {
                NodeKind::Program { statements } | NodeKind::Block { statements } => {
                    for stmt in statements {
                        if let Some(found) = self.find_in_node(stmt, pos) {
                            return Some(found);
                        }
                    }
                }
                NodeKind::Binary { left, right, .. }
                | NodeKind::Assignment { lhs: left, rhs: right, .. } => {
                    if let Some(found) = self.find_in_node(left, pos) {
                        return Some(found);
                    }
                    if let Some(found) = self.find_in_node(right, pos) {
                        return Some(found);
                    }
                }
                NodeKind::Subroutine { body, .. } => {
                    if let Some(found) = self.find_in_node(body, pos) {
                        return Some(found);
                    }
                }
                NodeKind::ExpressionStatement { expression } => {
                    if let Some(found) = self.find_in_node(expression, pos) {
                        return Some(found);
                    }
                }
                NodeKind::VariableDeclaration { variable, initializer, .. } => {
                    if let Some(found) = self.find_in_node(variable, pos) {
                        return Some(found);
                    }
                    if let Some(init) = initializer {
                        if let Some(found) = self.find_in_node(init, pos) {
                            return Some(found);
                        }
                    }
                }
                NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
                    if let Some(found) = self.find_in_node(condition, pos) {
                        return Some(found);
                    }
                    if let Some(found) = self.find_in_node(then_branch, pos) {
                        return Some(found);
                    }
                    for (cond, branch) in elsif_branches {
                        if let Some(found) = self.find_in_node(cond, pos) {
                            return Some(found);
                        }
                        if let Some(found) = self.find_in_node(branch, pos) {
                            return Some(found);
                        }
                    }
                    if let Some(else_b) = else_branch {
                        if let Some(found) = self.find_in_node(else_b, pos) {
                            return Some(found);
                        }
                    }
                }
                NodeKind::While { condition, body, .. } => {
                    if let Some(found) = self.find_in_node(condition, pos) {
                        return Some(found);
                    }
                    if let Some(found) = self.find_in_node(body, pos) {
                        return Some(found);
                    }
                }
                NodeKind::FunctionCall { args, .. } => {
                    for arg in args {
                        if let Some(found) = self.find_in_node(arg, pos) {
                            return Some(found);
                        }
                    }
                }
                NodeKind::Unary { operand, .. } => {
                    if let Some(found) = self.find_in_node(operand, pos) {
                        return Some(found);
                    }
                }
                _ => {}
            }

            // No more specific child, return this node
            Some(node)
        } else {
            None
        }
    }
813
814    /// Cache subtrees for reuse
815    fn cache_subtrees(&mut self) {
816        self.subtree_cache.clear();
817        let root = self.root.clone();
818        self.cache_node(&root);
819    }
820
821    fn cache_node(&mut self, node: &Node) {
822        // Cache this subtree by range
823        let range = (node.location.start, node.location.end);
824        self.subtree_cache.by_range.insert(range, Arc::new(node.clone()));
825
826        // Cache by content hash for common patterns
827        let hash = self.hash_node(node);
828        let priority = self.get_symbol_priority(node);
829
830        self.subtree_cache.by_content.insert(hash, Arc::new(node.clone()));
831        self.subtree_cache.critical_symbols.insert(hash, priority);
832        self.subtree_cache.lru.push_back(hash);
833        self.subtree_cache.evict_if_needed();
834
835        // Recursively cache children
836        match &node.kind {
837            NodeKind::Program { statements } | NodeKind::Block { statements } => {
838                for stmt in statements {
839                    self.cache_node(stmt);
840                }
841            }
842            NodeKind::Binary { left, right, .. } => {
843                self.cache_node(left);
844                self.cache_node(right);
845            }
846            NodeKind::Subroutine { body, .. } => {
847                self.cache_node(body);
848            }
849            NodeKind::ExpressionStatement { expression } => {
850                self.cache_node(expression);
851            }
852            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
853                self.cache_node(condition);
854                self.cache_node(then_branch);
855                for (cond, branch) in elsif_branches {
856                    self.cache_node(cond);
857                    self.cache_node(branch);
858                }
859                if let Some(else_b) = else_branch {
860                    self.cache_node(else_b);
861                }
862            }
863            NodeKind::While { condition, body, .. } => {
864                self.cache_node(condition);
865                self.cache_node(body);
866            }
867            NodeKind::For { init, condition, update, body, .. } => {
868                if let Some(i) = init {
869                    self.cache_node(i);
870                }
871                if let Some(c) = condition {
872                    self.cache_node(c);
873                }
874                if let Some(u) = update {
875                    self.cache_node(u);
876                }
877                self.cache_node(body);
878            }
879            NodeKind::Foreach { variable, list, body, continue_block } => {
880                self.cache_node(variable);
881                self.cache_node(list);
882                self.cache_node(body);
883                if let Some(cb) = continue_block {
884                    self.cache_node(cb);
885                }
886            }
887            NodeKind::VariableDeclaration { variable, initializer, .. } => {
888                self.cache_node(variable);
889                if let Some(init) = initializer {
890                    self.cache_node(init);
891                }
892            }
893            NodeKind::VariableListDeclaration { variables, initializer, .. } => {
894                for var in variables {
895                    self.cache_node(var);
896                }
897                if let Some(init) = initializer {
898                    self.cache_node(init);
899                }
900            }
901            NodeKind::Assignment { lhs, rhs, .. } => {
902                self.cache_node(lhs);
903                self.cache_node(rhs);
904            }
905            _ => {}
906        }
907    }
908
909    /// Generate hash for a node (for content-based caching)
910    fn hash_node(&self, node: &Node) -> u64 {
911        use std::collections::hash_map::DefaultHasher;
912        use std::hash::{Hash, Hasher};
913
914        let mut hasher = DefaultHasher::new();
915
916        // Hash node kind discriminant
917        std::mem::discriminant(&node.kind).hash(&mut hasher);
918
919        // Hash node content
920        match &node.kind {
921            NodeKind::Number { value } => value.hash(&mut hasher),
922            NodeKind::String { value, .. } => value.hash(&mut hasher),
923            NodeKind::Identifier { name } => name.hash(&mut hasher),
924            _ => {}
925        }
926
927        hasher.finish()
928    }
929
930    /// Count nodes in a subtree
931    fn count_nodes(&self, node: &Node) -> usize {
932        let mut count = 1;
933
934        match &node.kind {
935            NodeKind::Program { statements } | NodeKind::Block { statements } => {
936                for stmt in statements {
937                    count += self.count_nodes(stmt);
938                }
939            }
940            NodeKind::Binary { left, right, .. } => {
941                count += self.count_nodes(left);
942                count += self.count_nodes(right);
943            }
944            NodeKind::Subroutine { body, .. } => {
945                count += self.count_nodes(body);
946            }
947            NodeKind::ExpressionStatement { expression } => {
948                count += self.count_nodes(expression);
949            }
950            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
951                count += self.count_nodes(condition);
952                count += self.count_nodes(then_branch);
953                for (cond, branch) in elsif_branches {
954                    count += self.count_nodes(cond);
955                    count += self.count_nodes(branch);
956                }
957                if let Some(else_b) = else_branch {
958                    count += self.count_nodes(else_b);
959                }
960            }
961            NodeKind::While { condition, body, .. } => {
962                count += self.count_nodes(condition);
963                count += self.count_nodes(body);
964            }
965            NodeKind::For { init, condition, update, body, .. } => {
966                if let Some(i) = init {
967                    count += self.count_nodes(i);
968                }
969                if let Some(c) = condition {
970                    count += self.count_nodes(c);
971                }
972                if let Some(u) = update {
973                    count += self.count_nodes(u);
974                }
975                count += self.count_nodes(body);
976            }
977            NodeKind::Foreach { variable, list, body, continue_block } => {
978                count += self.count_nodes(variable);
979                count += self.count_nodes(list);
980                count += self.count_nodes(body);
981                if let Some(cb) = continue_block {
982                    count += self.count_nodes(cb);
983                }
984            }
985            NodeKind::VariableDeclaration { variable, initializer, .. } => {
986                count += self.count_nodes(variable);
987                if let Some(init) = initializer {
988                    count += self.count_nodes(init);
989                }
990            }
991            NodeKind::VariableListDeclaration { variables, initializer, .. } => {
992                for var in variables {
993                    count += self.count_nodes(var);
994                }
995                if let Some(init) = initializer {
996                    count += self.count_nodes(init);
997                }
998            }
999            NodeKind::Assignment { lhs, rhs, .. } => {
1000                count += self.count_nodes(lhs);
1001                count += self.count_nodes(rhs);
1002            }
1003            NodeKind::FunctionCall { args, .. } => {
1004                for arg in args {
1005                    count += self.count_nodes(arg);
1006                }
1007            }
1008            NodeKind::Unary { operand, .. } => {
1009                count += self.count_nodes(operand);
1010            }
1011            _ => {}
1012        }
1013
1014        count
1015    }
1016
1017    /// Determine the priority of a symbol for cache eviction
1018    fn get_symbol_priority(&self, node: &Node) -> SymbolPriority {
1019        match &node.kind {
1020            // Critical symbols needed for LSP features
1021            NodeKind::Package { .. } => SymbolPriority::Critical,
1022            NodeKind::Use { .. } | NodeKind::No { .. } => SymbolPriority::Critical,
1023            NodeKind::Subroutine { .. } => SymbolPriority::Critical,
1024
1025            // High priority symbols for completion and navigation
1026            NodeKind::FunctionCall { .. } => SymbolPriority::High,
1027            NodeKind::Variable { .. } => SymbolPriority::High,
1028            NodeKind::VariableDeclaration { .. } => SymbolPriority::High,
1029
1030            // Medium priority for structural elements
1031            NodeKind::Block { .. } => SymbolPriority::Medium,
1032            NodeKind::If { .. } | NodeKind::While { .. } | NodeKind::For { .. } => {
1033                SymbolPriority::Medium
1034            }
1035            NodeKind::Assignment { .. } => SymbolPriority::Medium,
1036
1037            // Low priority for literals and simple expressions
1038            NodeKind::Number { .. } | NodeKind::String { .. } => SymbolPriority::Low,
1039            NodeKind::Binary { .. } | NodeKind::Unary { .. } => SymbolPriority::Low,
1040
1041            // Default to medium for unknown types
1042            _ => SymbolPriority::Medium,
1043        }
1044    }
1045
    /// Get the root of the current parse tree.
    pub fn tree(&self) -> &Node {
        &self.root
    }
1050
    /// Get the current source text backing the parse tree.
    pub fn text(&self) -> &str {
        &self.source
    }
1055
    /// Get the performance metrics recorded for this document's parses.
    pub fn metrics(&self) -> &ParseMetrics {
        &self.metrics
    }
1060
    /// Set the maximum subtree-cache size; evicts immediately if the
    /// current cache already exceeds the new limit.
    pub fn set_cache_max_size(&mut self, max_size: usize) {
        self.subtree_cache.set_max_size(max_size);
    }
1065}
1066
1067impl SubtreeCache {
1068    fn new(max_size: usize) -> Self {
1069        SubtreeCache {
1070            by_content: HashMap::new(),
1071            by_range: HashMap::new(),
1072            lru: VecDeque::new(),
1073            critical_symbols: HashMap::new(),
1074            max_size,
1075        }
1076    }
1077
1078    fn clear(&mut self) {
1079        self.by_content.clear();
1080        self.by_range.clear();
1081        self.lru.clear();
1082        self.critical_symbols.clear();
1083    }
1084
1085    fn evict_if_needed(&mut self) {
1086        while self.by_content.len() > self.max_size {
1087            if let Some(hash) = self.find_least_important_entry() {
1088                debug!(
1089                    "Evicting cache entry with hash {} (priority: {:?})",
1090                    hash,
1091                    self.critical_symbols.get(&hash).copied().unwrap_or(SymbolPriority::Low)
1092                );
1093                self.by_content.remove(&hash);
1094                self.critical_symbols.remove(&hash);
1095                // Remove from LRU queue
1096                self.lru.retain(|&h| h != hash);
1097            } else {
1098                // Fallback: remove oldest entry if no low priority entries found
1099                if let Some(hash) = self.lru.pop_front() {
1100                    debug!("Fallback eviction for hash {}", hash);
1101                    self.by_content.remove(&hash);
1102                    self.critical_symbols.remove(&hash);
1103                }
1104            }
1105        }
1106    }
1107
1108    /// Find the least important cache entry for eviction.
1109    ///
1110    /// Uses a single pass over the LRU queue (O(n)) instead of sorting all
1111    /// candidates and doing O(n) position lookups per candidate (which was
1112    /// O(n^2) overall). LRU order inherently encodes age: earlier entries
1113    /// are older.
1114    fn find_least_important_entry(&self) -> Option<u64> {
1115        let mut best: Option<(u64, SymbolPriority)> = None;
1116
1117        for &hash in &self.lru {
1118            let priority = self.critical_symbols.get(&hash).copied().unwrap_or(SymbolPriority::Low);
1119
1120            match best {
1121                None => best = Some((hash, priority)),
1122                Some((_, best_priority)) => {
1123                    if priority < best_priority {
1124                        // Lower priority wins (should be evicted first)
1125                        best = Some((hash, priority));
1126                    }
1127                    // If same priority, keep the first (oldest) we found in LRU order
1128                }
1129            }
1130        }
1131
1132        best.map(|(hash, _)| hash)
1133    }
1134
1135    fn set_max_size(&mut self, max_size: usize) {
1136        self.max_size = max_size;
1137        self.evict_if_needed();
1138    }
1139}
1140
#[cfg(test)]
mod tests {
    use super::super::incremental_edit::IncrementalEdit;
    use super::*;

    #[test]
    fn test_incremental_single_token_edit() -> ParseResult<()> {
        let source = r#"
            my $x = 42;
            my $y = 100;
            print $x + $y;
        "#;

        let mut doc = IncrementalDocument::new(source.to_string())?;

        // Change 42 to 43
        let pos =
            source.find("42").ok_or_else(|| perl_parser_core::error::ParseError::SyntaxError {
                message: "test source should contain '42'".to_string(),
                location: 0,
            })?;
        let edit = IncrementalEdit::new(pos + 1, pos + 2, "3".to_string());

        doc.apply_edit(edit)?;

        // Should have high reuse
        assert!(doc.metrics.nodes_reused > 0);
        assert!(doc.metrics.nodes_reparsed < 5);
        assert!(doc.metrics.last_parse_time_ms < 1.0);

        Ok(())
    }

    #[test]
    fn test_incremental_multiple_edits() -> ParseResult<()> {
        let source = r#"
            sub calculate {
                my $a = 10;
                my $b = 20;
                return $a + $b;
            }
        "#;

        let mut doc = IncrementalDocument::new(source.to_string())?;

        let mut edits = IncrementalEditSet::new();

        // Change 10 to 15
        let pos_10 =
            source.find("10").ok_or_else(|| perl_parser_core::error::ParseError::SyntaxError {
                message: "test source should contain '10'".to_string(),
                location: 0,
            })?;
        edits.add(IncrementalEdit::new(pos_10, pos_10 + 2, "15".to_string()));

        // Change 20 to 25
        let pos_20 =
            source.find("20").ok_or_else(|| perl_parser_core::error::ParseError::SyntaxError {
                message: "test source should contain '20'".to_string(),
                location: 0,
            })?;
        edits.add(IncrementalEdit::new(pos_20, pos_20 + 2, "25".to_string()));

        doc.apply_edits(&edits)?;

        // Cache should preserve critical symbols even during batch edits
        // (SymbolPriority is Copy, so compare by value directly).
        let critical_count = doc
            .subtree_cache
            .critical_symbols
            .values()
            .filter(|&&p| p == SymbolPriority::Critical)
            .count();
        assert!(critical_count > 0, "Should preserve critical symbols during batch edits");

        // Verify metrics
        // Assertions enabled for Issue #255 (Incremental parsing metrics).
        // threshold relaxed to 10.0ms for CI environment stability.
        assert!(doc.metrics.nodes_reused > 0, "Incremental parsing should reuse nodes");
        assert!(
            doc.metrics.last_parse_time_ms < 10.0,
            "Incremental parse time was {:.2}ms, which exceeds 10.0ms threshold",
            doc.metrics.last_parse_time_ms
        );

        Ok(())
    }

    #[test]
    fn test_cache_eviction() -> ParseResult<()> {
        let source = "my $x = 1;";
        let doc = IncrementalDocument::new(source.to_string())?;

        // Cache should have entries
        assert!(!doc.subtree_cache.by_range.is_empty());
        assert!(!doc.subtree_cache.by_content.is_empty());

        Ok(())
    }

    #[test]
    fn test_symbol_priority_classification() -> ParseResult<()> {
        let source = r#"
            package TestPkg;
            use strict;

            sub test_func {
                my $var = 42;
                if ($var > 0) {
                    return $var + 1;
                }
            }
        "#;
        let doc = IncrementalDocument::new(source.to_string())?;

        // Verify we have different priority levels in cache
        // (`copied` over `cloned`: SymbolPriority is Copy).
        let priorities: std::collections::HashSet<_> =
            doc.subtree_cache.critical_symbols.values().copied().collect();

        // Should have critical symbols (package, use, sub)
        assert!(
            priorities.contains(&SymbolPriority::Critical),
            "Should classify package/use/sub as critical"
        );
        // Should have high priority symbols (variables)
        assert!(
            priorities.contains(&SymbolPriority::High),
            "Should classify variables as high priority"
        );
        // Should have lower priority symbols (literals, operators)
        assert!(
            priorities.contains(&SymbolPriority::Low)
                || priorities.contains(&SymbolPriority::Medium),
            "Should have lower priority symbols"
        );

        Ok(())
    }

    #[test]
    fn test_cache_respects_max_size() -> ParseResult<()> {
        let source = "my $x = 1; my $y = 2; my $z = 3;";
        let mut doc = IncrementalDocument::new(source.to_string())?;

        // Ensure cache starts larger than 1 entry
        assert!(doc.subtree_cache.by_content.len() > 1);

        // Shrink cache and verify eviction
        doc.set_cache_max_size(1);
        assert!(doc.subtree_cache.by_content.len() <= 1);

        // Applying an edit should not grow the cache beyond max_size
        let pos =
            source.find('1').ok_or_else(|| perl_parser_core::error::ParseError::SyntaxError {
                message: "test source should contain '1'".to_string(),
                location: 0,
            })?;
        let edit = IncrementalEdit::new(pos, pos + 1, "10".to_string());
        doc.apply_edit(edit)?;
        assert!(doc.subtree_cache.by_content.len() <= 1);

        Ok(())
    }

    #[test]
    fn test_cache_priority_preservation() -> ParseResult<()> {
        let source = r#"
            package MyPackage;
            use strict;
            use warnings;

            sub process {
                my $x = 42;
                my $y = "hello";
                return $x + 1;
            }
        "#;
        let mut doc = IncrementalDocument::new(source.to_string())?;

        // Store initial cache state
        let initial_cache_size = doc.subtree_cache.by_content.len();
        assert!(initial_cache_size > 3, "Should have multiple cached nodes");

        // Set very small cache to force aggressive eviction
        doc.set_cache_max_size(3);
        assert!(doc.subtree_cache.by_content.len() <= 3);

        // Check that critical symbols are preserved
        let has_critical_symbols = doc
            .subtree_cache
            .critical_symbols
            .values()
            .any(|&p| p == SymbolPriority::Critical);
        assert!(has_critical_symbols, "Should preserve critical symbols like package/use/sub");

        // Apply edit and verify critical symbols remain
        let pos =
            source.find("42").ok_or_else(|| perl_parser_core::error::ParseError::SyntaxError {
                message: "test source should contain '42'".to_string(),
                location: 0,
            })?;
        let edit = IncrementalEdit::new(pos, pos + 2, "100".to_string());
        doc.apply_edit(edit)?;
        assert!(doc.subtree_cache.by_content.len() <= 3);

        // Still should have critical symbols after edit
        let has_critical_after_edit = doc
            .subtree_cache
            .critical_symbols
            .values()
            .any(|&p| p == SymbolPriority::Critical);
        assert!(has_critical_after_edit, "Should preserve critical symbols after edit");

        Ok(())
    }

    #[test]
    fn test_workspace_symbol_cache_preservation() -> ParseResult<()> {
        let source = r#"
            package TestModule;

            sub exported_function { }
            sub internal_helper { }

            my $global_var = "test";
        "#;
        let mut doc = IncrementalDocument::new(source.to_string())?;

        // Force small cache size
        doc.set_cache_max_size(2);

        // Verify package declaration is preserved (critical for workspace symbols)
        let package_preserved = doc
            .subtree_cache
            .by_content
            .values()
            .any(|node| matches!(node.kind, NodeKind::Package { .. }));
        assert!(package_preserved, "Package declaration should be preserved for workspace symbols");

        Ok(())
    }

    #[test]
    fn test_completion_metadata_preservation() -> ParseResult<()> {
        let source = r#"
            use Data::Dumper;
            use List::Util qw(first max);

            sub calculate {
                my ($input, $multiplier) = @_;
                return $input * $multiplier;
            }
        "#;
        let mut doc = IncrementalDocument::new(source.to_string())?;

        // Force cache eviction
        doc.set_cache_max_size(4);

        // Verify use statements are preserved (critical for completion)
        let use_statements_count = doc
            .subtree_cache
            .by_content
            .values()
            .filter(|node| matches!(node.kind, NodeKind::Use { .. }))
            .count();
        assert!(
            use_statements_count >= 1,
            "Use statements should be preserved for completion metadata"
        );

        // Verify function definitions are preserved
        let function_preserved = doc
            .subtree_cache
            .by_content
            .values()
            .any(|node| matches!(node.kind, NodeKind::Subroutine { .. }));
        assert!(function_preserved, "Function definitions should be preserved for completion");

        Ok(())
    }

    #[test]
    fn test_code_lens_reference_preservation() -> ParseResult<()> {
        let source = r#"
            package MyClass;

            sub new {
                my $class = shift;
                return bless {}, $class;
            }

            sub process_data {
                my ($self, $data) = @_;
                return $self->transform($data);
            }
        "#;
        let mut doc = IncrementalDocument::new(source.to_string())?;

        // Force aggressive cache eviction
        doc.set_cache_max_size(3);

        // Package and subroutines should be preserved for code lens reference counting
        let critical_nodes = doc
            .subtree_cache
            .by_content
            .values()
            .filter(|node| {
                matches!(node.kind, NodeKind::Package { .. } | NodeKind::Subroutine { .. })
            })
            .count();
        assert!(critical_nodes >= 2, "Should preserve package and key subroutines for code lens");

        Ok(())
    }
}