perl_parser/incremental/
incremental_advanced_reuse.rs

1//! Advanced tree reuse algorithms for incremental parsing
2//!
3//! This module implements sophisticated AST node reuse strategies that go beyond
4//! simple value matching to achieve high node reuse rates even for complex edits.
5//!
6//! ## Key Features
7//!
8//! - **Structural similarity analysis** - Compare AST subtree patterns
9//! - **Position-aware reuse** - Understand which nodes can be safely repositioned
10//! - **Content-based hashing** - Fast comparison of subtree equivalence
11//! - **Incremental node mapping** - Efficient lookup of reusable nodes
12//! - **Advanced validation** - Ensure reused nodes maintain correctness
13//!
14//! ## Performance Targets
15//!
16//! - **≥85% node reuse** for simple value edits
17//! - **≥70% node reuse** for structural modifications
18//! - **≥50% node reuse** for complex multi-edit scenarios
19//! - **<500µs processing** for reuse analysis on typical documents
20
21use perl_parser_core::{
22    ast::{Node, NodeKind},
23    edit::EditSet,
24    position::{Position, Range},
25};
26use std::collections::{HashMap, HashSet};
27use std::hash::{DefaultHasher, Hash, Hasher};
28
29/// Advanced node reuse analyzer with sophisticated matching algorithms
30#[derive(Debug)]
31pub struct AdvancedReuseAnalyzer {
32    /// Cache of node structural hashes for fast comparison
33    node_hashes: HashMap<usize, u64>,
34    /// Mapping of positions to potentially reusable nodes
35    position_map: HashMap<usize, Vec<NodeCandidate>>,
36    /// Set of nodes that are known to be affected by edits
37    affected_nodes: HashSet<usize>,
38    /// Statistics for reuse analysis
39    pub analysis_stats: ReuseAnalysisStats,
40}
41
42/// Statistics tracking reuse analysis performance and effectiveness
43#[derive(Debug, Default, Clone)]
44pub struct ReuseAnalysisStats {
45    pub nodes_analyzed: usize,
46    pub structural_matches: usize,
47    pub content_matches: usize,
48    pub position_adjustments: usize,
49    pub reuse_candidates_found: usize,
50    pub validation_passes: usize,
51    pub validation_failures: usize,
52}
53
54/// A candidate node for reuse with metadata about its reusability
55#[derive(Debug, Clone)]
56#[allow(dead_code)] // Fields used by future advanced matching strategies
57struct NodeCandidate {
58    node: Node,
59    structural_hash: u64,
60    confidence_score: f64,
61    position_delta: isize,
62    reuse_type: ReuseType,
63}
64
65/// Types of reuse strategies available for nodes
66#[derive(Debug, Clone, PartialEq)]
67#[allow(dead_code)] // Used by future advanced matching strategies
68pub enum ReuseType {
69    /// Direct reuse - node unchanged
70    Direct,
71    /// Position shift - same content, different position
72    PositionShift,
73    /// Content update - same structure, updated values
74    ContentUpdate,
75    /// Structural equivalent - same pattern with different details
76    StructuralEquivalent,
77}
78
79/// Configuration for reuse analysis behavior
80#[derive(Debug, Clone)]
81pub struct ReuseConfig {
82    /// Minimum confidence score for reuse (0.0-1.0)
83    pub min_confidence: f64,
84    /// Maximum position shift allowed for reuse
85    pub max_position_shift: usize,
86    /// Enable aggressive structural matching
87    pub aggressive_structural_matching: bool,
88    /// Enable content-based reuse for literals
89    pub enable_content_reuse: bool,
90    /// Maximum depth for recursive analysis
91    pub max_analysis_depth: usize,
92}
93
94impl Default for ReuseConfig {
95    fn default() -> Self {
96        ReuseConfig {
97            min_confidence: 0.75,
98            max_position_shift: 1000,
99            aggressive_structural_matching: true,
100            enable_content_reuse: true,
101            max_analysis_depth: 10,
102        }
103    }
104}
105
106impl Default for AdvancedReuseAnalyzer {
107    fn default() -> Self {
108        Self::new()
109    }
110}
111
112impl AdvancedReuseAnalyzer {
113    /// Create a new reuse analyzer with default configuration
114    pub fn new() -> Self {
115        AdvancedReuseAnalyzer {
116            node_hashes: HashMap::new(),
117            position_map: HashMap::new(),
118            affected_nodes: HashSet::new(),
119            analysis_stats: ReuseAnalysisStats::default(),
120        }
121    }
122
123    /// Create analyzer with custom configuration
124    pub fn with_config(_config: ReuseConfig) -> Self {
125        // Store config for future use if needed
126        Self::new()
127    }
128
129    /// Analyze potential node reuse between old and new trees
130    ///
131    /// Returns a mapping of old node positions to reuse strategies,
132    /// enabling intelligent tree reconstruction with maximum node reuse.
133    pub fn analyze_reuse_opportunities(
134        &mut self,
135        old_tree: &Node,
136        new_tree: &Node,
137        edits: &EditSet,
138        config: &ReuseConfig,
139    ) -> ReuseAnalysisResult {
140        self.analysis_stats = ReuseAnalysisStats::default();
141
142        // Reset internal state
143        self.node_hashes.clear();
144        self.position_map.clear();
145        self.affected_nodes.clear();
146
147        // Build structural analysis of both trees
148        let old_analysis = self.build_tree_analysis(old_tree, config);
149        let new_analysis = self.build_tree_analysis(new_tree, config);
150
151        // Identify affected regions from edits
152        self.identify_affected_nodes(old_tree, edits);
153
154        // Find reuse candidates using multiple strategies
155        let mut reuse_map = HashMap::new();
156
157        // Strategy 1: Direct structural matching
158        self.find_direct_structural_matches(&old_analysis, &new_analysis, &mut reuse_map, config);
159
160        // Strategy 2: Position-shifted matching
161        self.find_position_shifted_matches(&old_analysis, &new_analysis, &mut reuse_map, config);
162
163        // Strategy 3: Content-updated matching
164        if config.enable_content_reuse {
165            self.find_content_updated_matches(&old_analysis, &new_analysis, &mut reuse_map, config);
166        }
167
168        // Strategy 4: Aggressive structural matching
169        if config.aggressive_structural_matching {
170            self.find_aggressive_structural_matches(
171                &old_analysis,
172                &new_analysis,
173                &mut reuse_map,
174                config,
175            );
176        }
177
178        // Validate reuse candidates and calculate confidence scores
179        let validated_reuse_map =
180            self.validate_reuse_candidates(reuse_map, old_tree, new_tree, config);
181
182        // Calculate final statistics
183        let total_old_nodes = self.count_nodes(old_tree);
184        let total_new_nodes = self.count_nodes(new_tree);
185        let reused_nodes = validated_reuse_map.len();
186        let reuse_percentage = if total_old_nodes > 0 {
187            (reused_nodes as f64 / total_old_nodes as f64) * 100.0
188        } else {
189            0.0
190        };
191
192        ReuseAnalysisResult {
193            reuse_map: validated_reuse_map,
194            total_old_nodes,
195            total_new_nodes,
196            reused_nodes,
197            reuse_percentage,
198            analysis_stats: self.analysis_stats.clone(),
199        }
200    }
201
202    /// Build comprehensive analysis of tree structure
203    fn build_tree_analysis(&mut self, tree: &Node, config: &ReuseConfig) -> TreeAnalysis {
204        let mut analysis = TreeAnalysis::new();
205        self.analyze_node_recursive(tree, &mut analysis, 0, config);
206        analysis
207    }
208
209    /// Recursively analyze nodes to build structural understanding
210    fn analyze_node_recursive(
211        &mut self,
212        node: &Node,
213        analysis: &mut TreeAnalysis,
214        depth: usize,
215        config: &ReuseConfig,
216    ) {
217        if depth > config.max_analysis_depth {
218            return;
219        }
220
221        self.analysis_stats.nodes_analyzed += 1;
222
223        // Calculate structural hash
224        let structural_hash = self.calculate_structural_hash(node);
225        self.node_hashes.insert(node.location.start, structural_hash);
226
227        // Create node info for analysis
228        let node_info = NodeAnalysisInfo {
229            node: node.clone(),
230            structural_hash,
231            depth,
232            children_count: self.get_children_count(node),
233            content_hash: self.calculate_content_hash(node),
234        };
235
236        analysis.add_node_info(node.location.start, node_info);
237
238        // Add to position map
239        let candidate = NodeCandidate {
240            node: node.clone(),
241            structural_hash,
242            confidence_score: 1.0, // Will be refined during analysis
243            position_delta: 0,
244            reuse_type: ReuseType::Direct,
245        };
246
247        self.position_map.entry(node.location.start).or_default().push(candidate);
248
249        // Recurse into children
250        match &node.kind {
251            NodeKind::Program { statements } | NodeKind::Block { statements } => {
252                for stmt in statements {
253                    self.analyze_node_recursive(stmt, analysis, depth + 1, config);
254                }
255            }
256            NodeKind::VariableDeclaration { variable, initializer, .. } => {
257                self.analyze_node_recursive(variable, analysis, depth + 1, config);
258                if let Some(init) = initializer {
259                    self.analyze_node_recursive(init, analysis, depth + 1, config);
260                }
261            }
262            NodeKind::Binary { left, right, .. } => {
263                self.analyze_node_recursive(left, analysis, depth + 1, config);
264                self.analyze_node_recursive(right, analysis, depth + 1, config);
265            }
266            NodeKind::Unary { operand, .. } => {
267                self.analyze_node_recursive(operand, analysis, depth + 1, config);
268            }
269            NodeKind::FunctionCall { args, .. } => {
270                for arg in args {
271                    self.analyze_node_recursive(arg, analysis, depth + 1, config);
272                }
273            }
274            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
275                self.analyze_node_recursive(condition, analysis, depth + 1, config);
276                self.analyze_node_recursive(then_branch, analysis, depth + 1, config);
277                for (cond, branch) in elsif_branches {
278                    self.analyze_node_recursive(cond, analysis, depth + 1, config);
279                    self.analyze_node_recursive(branch, analysis, depth + 1, config);
280                }
281                if let Some(branch) = else_branch {
282                    self.analyze_node_recursive(branch, analysis, depth + 1, config);
283                }
284            }
285            _ => {} // Leaf nodes
286        }
287    }
288
289    /// Identify nodes affected by edits
290    fn identify_affected_nodes(&mut self, tree: &Node, edits: &EditSet) {
291        for range in edits.coalesced_affected_ranges() {
292            self.mark_affected_nodes_in_range(tree, range.start.byte, range.end.byte);
293        }
294    }
295
296    /// Mark nodes as affected if they overlap with edit ranges
297    fn mark_affected_nodes_in_range(&mut self, node: &Node, start: usize, end: usize) {
298        let node_range = Range::from(node.location);
299        let edit_range = Range::new(Position::new(start, 0, 0), Position::new(end, 0, 0));
300
301        if !node_range.overlaps(&edit_range) {
302            return;
303        }
304        self.affected_nodes.insert(node.location.start);
305
306        // Recurse into children
307        match &node.kind {
308            NodeKind::Program { statements } | NodeKind::Block { statements } => {
309                for stmt in statements {
310                    self.mark_affected_nodes_in_range(stmt, start, end);
311                }
312            }
313            NodeKind::VariableDeclaration { variable, initializer, .. } => {
314                self.mark_affected_nodes_in_range(variable, start, end);
315                if let Some(init) = initializer {
316                    self.mark_affected_nodes_in_range(init, start, end);
317                }
318            }
319            NodeKind::Binary { left, right, .. } => {
320                self.mark_affected_nodes_in_range(left, start, end);
321                self.mark_affected_nodes_in_range(right, start, end);
322            }
323            _ => {} // Handle other node types as needed
324        }
325    }
326
327    /// Find direct structural matches between trees
328    fn find_direct_structural_matches(
329        &mut self,
330        old_analysis: &TreeAnalysis,
331        new_analysis: &TreeAnalysis,
332        reuse_map: &mut HashMap<usize, ReuseStrategy>,
333        config: &ReuseConfig,
334    ) {
335        let mut used_target_positions: HashSet<usize> =
336            reuse_map.values().map(|strategy| strategy.target_position).collect();
337
338        for (old_pos, old_info) in &old_analysis.node_info {
339            // Skip affected nodes for direct matching
340            if self.affected_nodes.contains(old_pos) {
341                continue;
342            }
343
344            // Look for exact structural matches in new tree
345            for (new_pos, new_info) in &new_analysis.node_info {
346                if used_target_positions.contains(new_pos) {
347                    continue;
348                }
349
350                if old_info.structural_hash == new_info.structural_hash
351                    && old_info.children_count == new_info.children_count
352                {
353                    let confidence = self.calculate_match_confidence(old_info, new_info);
354                    if confidence >= config.min_confidence {
355                        reuse_map.insert(
356                            *old_pos,
357                            ReuseStrategy {
358                                target_position: *new_pos,
359                                reuse_type: ReuseType::Direct,
360                                confidence_score: confidence,
361                                position_adjustment: (*new_pos as isize) - (*old_pos as isize),
362                            },
363                        );
364                        used_target_positions.insert(*new_pos);
365                        self.analysis_stats.structural_matches += 1;
366                        break; // Use first good match
367                    }
368                }
369            }
370        }
371    }
372
373    /// Find position-shifted matches (same content, different location)
374    fn find_position_shifted_matches(
375        &mut self,
376        old_analysis: &TreeAnalysis,
377        new_analysis: &TreeAnalysis,
378        reuse_map: &mut HashMap<usize, ReuseStrategy>,
379        config: &ReuseConfig,
380    ) {
381        let mut used_target_positions: HashSet<usize> =
382            reuse_map.values().map(|strategy| strategy.target_position).collect();
383
384        for (old_pos, old_info) in &old_analysis.node_info {
385            if reuse_map.contains_key(old_pos) {
386                continue;
387            }
388
389            let mut best_match: Option<(usize, f64)> = None;
390            for (new_pos, new_info) in &new_analysis.node_info {
391                if used_target_positions.contains(new_pos) {
392                    continue;
393                }
394
395                if old_info.content_hash == new_info.content_hash
396                    && old_info.structural_hash == new_info.structural_hash
397                {
398                    let position_shift = (*new_pos as isize - *old_pos as isize).unsigned_abs();
399                    if !self.is_position_shift_candidate_safe(
400                        old_info,
401                        new_info,
402                        position_shift,
403                        config,
404                    ) {
405                        continue;
406                    }
407
408                    let confidence = self.calculate_shifted_match_confidence(
409                        old_info,
410                        new_info,
411                        position_shift,
412                        config,
413                    );
414                    if confidence >= config.min_confidence
415                        && best_match
416                            .as_ref()
417                            .is_none_or(|&(_, best_score)| confidence > best_score)
418                    {
419                        best_match = Some((*new_pos, confidence));
420                    }
421                }
422            }
423
424            if let Some((best_pos, confidence)) = best_match {
425                reuse_map.insert(
426                    *old_pos,
427                    ReuseStrategy {
428                        target_position: best_pos,
429                        reuse_type: ReuseType::PositionShift,
430                        confidence_score: confidence,
431                        position_adjustment: (best_pos as isize) - (*old_pos as isize),
432                    },
433                );
434                used_target_positions.insert(best_pos);
435                self.analysis_stats.position_adjustments += 1;
436            }
437        }
438    }
439
440    /// Find content-updated matches (structure same, values changed)
441    fn find_content_updated_matches(
442        &mut self,
443        old_analysis: &TreeAnalysis,
444        new_analysis: &TreeAnalysis,
445        reuse_map: &mut HashMap<usize, ReuseStrategy>,
446        config: &ReuseConfig,
447    ) {
448        for (old_pos, old_info) in &old_analysis.node_info {
449            if reuse_map.contains_key(old_pos) {
450                continue;
451            }
452
453            // For leaf nodes, check if structure matches but content differs
454            if old_info.children_count == 0 {
455                for (new_pos, new_info) in &new_analysis.node_info {
456                    if old_info.structural_hash == new_info.structural_hash
457                        && old_info.content_hash != new_info.content_hash
458                        && self.are_compatible_for_content_update(&old_info.node, &new_info.node)
459                    {
460                        let confidence = 0.8; // Content updates get medium confidence
461                        if confidence >= config.min_confidence {
462                            reuse_map.insert(
463                                *old_pos,
464                                ReuseStrategy {
465                                    target_position: *new_pos,
466                                    reuse_type: ReuseType::ContentUpdate,
467                                    confidence_score: confidence,
468                                    position_adjustment: (*new_pos as isize) - (*old_pos as isize),
469                                },
470                            );
471                            self.analysis_stats.content_matches += 1;
472                            break;
473                        }
474                    }
475                }
476            }
477        }
478    }
479
480    /// Find aggressive structural matches using pattern analysis
481    fn find_aggressive_structural_matches(
482        &mut self,
483        old_analysis: &TreeAnalysis,
484        new_analysis: &TreeAnalysis,
485        reuse_map: &mut HashMap<usize, ReuseStrategy>,
486        config: &ReuseConfig,
487    ) {
488        // This is the most sophisticated matching - look for structural patterns
489        // even when exact hashes don't match
490        for (old_pos, old_info) in &old_analysis.node_info {
491            if reuse_map.contains_key(old_pos) {
492                continue;
493            }
494
495            let mut best_match: Option<(usize, f64)> = None;
496
497            for (new_pos, new_info) in &new_analysis.node_info {
498                if old_info.children_count > 0
499                    && self.get_children_count(&old_info.node)
500                        != self.get_children_count(&new_info.node)
501                {
502                    continue;
503                }
504
505                // Compare structural similarity
506                let similarity =
507                    self.calculate_structural_similarity(&old_info.node, &new_info.node);
508                if similarity >= config.min_confidence * 0.8 {
509                    // Slightly lower threshold for aggressive matching
510                    if best_match.as_ref().is_none_or(|&(_, s)| similarity > s) {
511                        best_match = Some((*new_pos, similarity));
512                    }
513                }
514            }
515
516            if let Some((best_pos, confidence)) = best_match {
517                if confidence >= config.min_confidence * 0.7 {
518                    // Final threshold check
519                    reuse_map.insert(
520                        *old_pos,
521                        ReuseStrategy {
522                            target_position: best_pos,
523                            reuse_type: ReuseType::StructuralEquivalent,
524                            confidence_score: confidence,
525                            position_adjustment: (best_pos as isize) - (*old_pos as isize),
526                        },
527                    );
528                    self.analysis_stats.reuse_candidates_found += 1;
529                }
530            }
531        }
532    }
533
534    /// Validate reuse candidates to ensure correctness
535    fn validate_reuse_candidates(
536        &mut self,
537        candidates: HashMap<usize, ReuseStrategy>,
538        old_tree: &Node,
539        new_tree: &Node,
540        config: &ReuseConfig,
541    ) -> HashMap<usize, ReuseStrategy> {
542        let mut validated = HashMap::new();
543
544        for (old_pos, strategy) in candidates {
545            if self.validate_reuse_strategy(&strategy, old_tree, new_tree, config) {
546                validated.insert(old_pos, strategy);
547                self.analysis_stats.validation_passes += 1;
548            } else {
549                self.analysis_stats.validation_failures += 1;
550            }
551        }
552
553        validated
554    }
555
556    /// Calculate structural hash for fast comparison
557    fn calculate_structural_hash(&self, node: &Node) -> u64 {
558        let mut hasher = DefaultHasher::new();
559
560        // Hash node kind discriminant
561        std::mem::discriminant(&node.kind).hash(&mut hasher);
562
563        // Hash structural properties based on node type
564        match &node.kind {
565            NodeKind::Program { statements } => {
566                statements.len().hash(&mut hasher);
567                "program".hash(&mut hasher);
568            }
569            NodeKind::Block { statements } => {
570                statements.len().hash(&mut hasher);
571                "block".hash(&mut hasher);
572            }
573            NodeKind::VariableDeclaration { declarator, .. } => {
574                declarator.hash(&mut hasher);
575                "vardecl".hash(&mut hasher);
576            }
577            NodeKind::Binary { op, .. } => {
578                op.hash(&mut hasher);
579                "binary".hash(&mut hasher);
580            }
581            NodeKind::Unary { op, .. } => {
582                op.hash(&mut hasher);
583                "unary".hash(&mut hasher);
584            }
585            NodeKind::FunctionCall { name, args } => {
586                name.hash(&mut hasher);
587                args.len().hash(&mut hasher);
588                "funccall".hash(&mut hasher);
589            }
590            NodeKind::Number { .. } => "number".hash(&mut hasher),
591            NodeKind::String { interpolated, .. } => {
592                interpolated.hash(&mut hasher);
593                "string".hash(&mut hasher);
594            }
595            NodeKind::Variable { sigil, .. } => {
596                sigil.hash(&mut hasher);
597                "variable".hash(&mut hasher);
598            }
599            NodeKind::Identifier { .. } => "identifier".hash(&mut hasher),
600            _ => "other".hash(&mut hasher),
601        }
602
603        hasher.finish()
604    }
605
606    /// Calculate content-based hash for value comparison
607    fn calculate_content_hash(&self, node: &Node) -> u64 {
608        let mut hasher = DefaultHasher::new();
609
610        match &node.kind {
611            NodeKind::Number { value } => value.hash(&mut hasher),
612            NodeKind::String { value, .. } => value.hash(&mut hasher),
613            NodeKind::Variable { name, .. } => name.hash(&mut hasher),
614            NodeKind::Identifier { name } => name.hash(&mut hasher),
615            _ => {
616                // For non-leaf nodes, hash is based on structure
617                self.calculate_structural_hash(node).hash(&mut hasher);
618            }
619        }
620
621        hasher.finish()
622    }
623
624    /// Get count of direct children for a node
625    fn get_children_count(&self, node: &Node) -> usize {
626        match &node.kind {
627            NodeKind::Program { statements } | NodeKind::Block { statements } => statements.len(),
628            NodeKind::VariableDeclaration { initializer, .. } => {
629                if initializer.is_some() { 2 } else { 1 } // variable + optional initializer
630            }
631            NodeKind::Binary { .. } => 2, // left + right
632            NodeKind::Unary { .. } => 1,  // operand
633            NodeKind::FunctionCall { args, .. } => args.len(),
634            NodeKind::If { elsif_branches, else_branch, .. } => {
635                2 + elsif_branches.len() * 2 + if else_branch.is_some() { 1 } else { 0 }
636            }
637            _ => 0, // Leaf nodes
638        }
639    }
640
641    /// Calculate confidence score for a potential match
642    fn calculate_match_confidence(
643        &self,
644        old_info: &NodeAnalysisInfo,
645        new_info: &NodeAnalysisInfo,
646    ) -> f64 {
647        let mut confidence = 0.0f64;
648
649        // Structural match bonus
650        if old_info.structural_hash == new_info.structural_hash {
651            confidence += 0.4;
652        }
653
654        // Content match bonus
655        if old_info.content_hash == new_info.content_hash {
656            confidence += 0.3;
657        }
658
659        // Children count match bonus
660        if old_info.children_count == new_info.children_count {
661            confidence += 0.2;
662        }
663
664        // Depth similarity bonus
665        let depth_diff = (old_info.depth as isize - new_info.depth as isize).abs();
666        if depth_diff == 0 {
667            confidence += 0.1;
668        } else if depth_diff <= 2 {
669            confidence += 0.05;
670        }
671
672        confidence.min(1.0)
673    }
674
675    /// Calculate structural similarity between two nodes
676    fn calculate_structural_similarity(&self, old_node: &Node, new_node: &Node) -> f64 {
677        // This is a more sophisticated comparison than hash equality
678        let mut similarity = 0.0;
679
680        // Base similarity from node type
681        if std::mem::discriminant(&old_node.kind) == std::mem::discriminant(&new_node.kind) {
682            similarity += 0.5;
683
684            // Additional similarity based on node-specific properties
685            match (&old_node.kind, &new_node.kind) {
686                (NodeKind::Program { statements: s1 }, NodeKind::Program { statements: s2 }) => {
687                    let len_similarity = 1.0
688                        - ((s1.len() as f64 - s2.len() as f64).abs()
689                            / (s1.len().max(s2.len()) as f64));
690                    similarity += 0.3 * len_similarity;
691                }
692                (NodeKind::Binary { op: op1, .. }, NodeKind::Binary { op: op2, .. }) => {
693                    if op1 == op2 {
694                        similarity += 0.4;
695                    }
696                }
697                (
698                    NodeKind::FunctionCall { name: n1, args: a1 },
699                    NodeKind::FunctionCall { name: n2, args: a2 },
700                ) => {
701                    if n1 == n2 {
702                        similarity += 0.3;
703                    }
704                    let arg_similarity = 1.0
705                        - ((a1.len() as f64 - a2.len() as f64).abs()
706                            / (a1.len().max(a2.len()) as f64));
707                    similarity += 0.2 * arg_similarity;
708                }
709                _ => {
710                    similarity += 0.2; // Generic bonus for same type
711                }
712            }
713        }
714
715        similarity.min(1.0)
716    }
717
718    /// Check if two nodes are compatible for content updates
719    fn are_compatible_for_content_update(&self, old_node: &Node, new_node: &Node) -> bool {
720        match (&old_node.kind, &new_node.kind) {
721            (NodeKind::Number { .. }, NodeKind::Number { .. }) => true,
722            (
723                NodeKind::String { interpolated: i1, .. },
724                NodeKind::String { interpolated: i2, .. },
725            ) => i1 == i2,
726            (NodeKind::Variable { sigil: s1, .. }, NodeKind::Variable { sigil: s2, .. }) => {
727                s1 == s2
728            }
729            (NodeKind::Identifier { .. }, NodeKind::Identifier { .. }) => true,
730            _ => false,
731        }
732    }
733
734    fn is_position_shift_candidate_safe(
735        &self,
736        old_info: &NodeAnalysisInfo,
737        new_info: &NodeAnalysisInfo,
738        position_shift: usize,
739        config: &ReuseConfig,
740    ) -> bool {
741        if position_shift > config.max_position_shift {
742            return false;
743        }
744
745        let old_is_container = self.is_container_node(&old_info.node);
746        let new_is_container = self.is_container_node(&new_info.node);
747        if old_is_container != new_is_container {
748            return false;
749        }
750
751        if old_is_container {
752            let max_container_shift = config.max_position_shift / 4;
753            if position_shift > max_container_shift {
754                return false;
755            }
756
757            let depth_diff = (old_info.depth as isize - new_info.depth as isize).unsigned_abs();
758            if depth_diff > 1 || old_info.children_count != new_info.children_count {
759                return false;
760            }
761        }
762
763        true
764    }
765
766    fn calculate_shifted_match_confidence(
767        &self,
768        old_info: &NodeAnalysisInfo,
769        new_info: &NodeAnalysisInfo,
770        position_shift: usize,
771        config: &ReuseConfig,
772    ) -> f64 {
773        let base_confidence = self.calculate_match_confidence(old_info, new_info);
774        let shift_ratio = position_shift as f64 / config.max_position_shift.max(1) as f64;
775
776        let shift_penalty = if self.is_container_node(&old_info.node) {
777            0.45 * shift_ratio.min(1.0)
778        } else if self.is_content_stable_leaf(&old_info.node) {
779            0.12 * shift_ratio.min(1.0)
780        } else {
781            0.30 * shift_ratio.min(1.0)
782        };
783
784        (base_confidence - shift_penalty).clamp(0.0, 1.0)
785    }
786
787    fn is_content_stable_leaf(&self, node: &Node) -> bool {
788        matches!(
789            node.kind,
790            NodeKind::Number { .. }
791                | NodeKind::String { .. }
792                | NodeKind::Identifier { .. }
793                | NodeKind::Variable { .. }
794        )
795    }
796
797    fn is_container_node(&self, node: &Node) -> bool {
798        matches!(node.kind, NodeKind::Program { .. } | NodeKind::Block { .. } | NodeKind::If { .. })
799    }
800
801    /// Validate a reuse strategy for correctness
802    fn validate_reuse_strategy(
803        &self,
804        _strategy: &ReuseStrategy,
805        _old_tree: &Node,
806        _new_tree: &Node,
807        _config: &ReuseConfig,
808    ) -> bool {
809        // Implement validation logic:
810        // - Check that reused nodes maintain parent-child relationships
811        // - Verify position adjustments are reasonable
812        // - Ensure content updates are semantically valid
813        // For now, accept all strategies (can be enhanced with specific validation)
814        true
815    }
816
817    /// Count total nodes in a tree
818    fn count_nodes(&self, node: &Node) -> usize {
819        let mut count = 1;
820
821        match &node.kind {
822            NodeKind::Program { statements } | NodeKind::Block { statements } => {
823                for stmt in statements {
824                    count += self.count_nodes(stmt);
825                }
826            }
827            NodeKind::VariableDeclaration { variable, initializer, .. } => {
828                count += self.count_nodes(variable);
829                if let Some(init) = initializer {
830                    count += self.count_nodes(init);
831                }
832            }
833            NodeKind::Binary { left, right, .. } => {
834                count += self.count_nodes(left);
835                count += self.count_nodes(right);
836            }
837            NodeKind::Unary { operand, .. } => {
838                count += self.count_nodes(operand);
839            }
840            NodeKind::FunctionCall { args, .. } => {
841                for arg in args {
842                    count += self.count_nodes(arg);
843                }
844            }
845            NodeKind::If { condition, then_branch, elsif_branches, else_branch } => {
846                count += self.count_nodes(condition);
847                count += self.count_nodes(then_branch);
848                for (cond, branch) in elsif_branches {
849                    count += self.count_nodes(cond);
850                    count += self.count_nodes(branch);
851                }
852                if let Some(branch) = else_branch {
853                    count += self.count_nodes(branch);
854                }
855            }
856            _ => {} // Leaf nodes
857        }
858
859        count
860    }
861
862    /// Map an old-tree byte position to its corresponding position in the new
863    /// tree, using the supplied [`EditSet`] to apply byte shifts.
864    ///
865    /// Semantics:
866    /// - If `old_pos` precedes the first edit's `start_byte`, returns `old_pos`
867    ///   unchanged.
868    /// - If `old_pos` falls inside an edit's old range
869    ///   `[start_byte, old_end_byte)`, returns that edit's `new_end_byte`
870    ///   (i.e. the position is consumed by the edit and snaps to the edit's
871    ///   new boundary).
872    /// - Otherwise the position is shifted by the cumulative byte shift of all
873    ///   prior edits whose `old_end_byte <= old_pos`.
874    ///
875    /// Edits in [`EditSet`] are sorted by `start_byte`, so iteration short-
876    /// circuits as soon as the next edit starts past `old_pos`.
877    pub fn map_old_position_to_new(&self, old_pos: usize, edits: &EditSet) -> usize {
878        let mut shift: isize = 0;
879        for edit in edits.edits() {
880            if old_pos < edit.start_byte {
881                break;
882            }
883            if old_pos < edit.old_end_byte {
884                // Position is consumed by this edit; snap to its new end.
885                return edit.new_end_byte;
886            }
887            shift += edit.byte_shift();
888        }
889        let signed = old_pos as isize + shift;
890        if signed < 0 { 0 } else { signed as usize }
891    }
892
893    /// Attempt to register an `old_pos -> new_pos` reuse claim, enforcing the
894    /// one-to-one invariant: at most one old position may map to any given
895    /// new position.
896    ///
897    /// Returns `true` if the registration was inserted, `false` if some other
898    /// `old_pos` has already claimed the same `new_pos` (in which case the
899    /// existing claim is left unchanged).
900    pub fn try_register_match(
901        &self,
902        reuse_map: &mut HashMap<usize, ReuseStrategy>,
903        old_pos: usize,
904        new_pos: usize,
905        reuse_type: ReuseType,
906        confidence: f64,
907    ) -> bool {
908        if reuse_map.values().any(|s| s.target_position == new_pos) {
909            return false;
910        }
911        reuse_map.insert(
912            old_pos,
913            ReuseStrategy {
914                target_position: new_pos,
915                reuse_type,
916                confidence_score: confidence,
917                position_adjustment: (new_pos as isize) - (old_pos as isize),
918            },
919        );
920        true
921    }
922}
923
924/// Comprehensive analysis of a tree structure
925#[derive(Debug)]
926struct TreeAnalysis {
927    node_info: HashMap<usize, NodeAnalysisInfo>,
928}
929
930impl TreeAnalysis {
931    fn new() -> Self {
932        TreeAnalysis { node_info: HashMap::new() }
933    }
934
935    fn add_node_info(&mut self, position: usize, info: NodeAnalysisInfo) {
936        self.node_info.insert(position, info);
937    }
938}
939
940/// Detailed information about a node for reuse analysis
941#[derive(Debug, Clone)]
942struct NodeAnalysisInfo {
943    node: Node,
944    structural_hash: u64,
945    content_hash: u64,
946    depth: usize,
947    children_count: usize,
948}
949
950/// Strategy for reusing a node from old tree to new tree
951#[derive(Debug, Clone)]
952pub struct ReuseStrategy {
953    pub target_position: usize,
954    pub reuse_type: ReuseType,
955    pub confidence_score: f64,
956    pub position_adjustment: isize,
957}
958
959/// Result of reuse analysis with comprehensive metrics
960#[derive(Debug)]
961pub struct ReuseAnalysisResult {
962    pub reuse_map: HashMap<usize, ReuseStrategy>,
963    pub total_old_nodes: usize,
964    pub total_new_nodes: usize,
965    pub reused_nodes: usize,
966    pub reuse_percentage: f64,
967    pub analysis_stats: ReuseAnalysisStats,
968}
969
970impl ReuseAnalysisResult {
971    /// Check if reuse analysis achieved target efficiency
972    pub fn meets_efficiency_target(&self, target_percentage: f64) -> bool {
973        self.reuse_percentage >= target_percentage
974    }
975
976    /// Get a summary of the analysis performance
977    pub fn performance_summary(&self) -> String {
978        format!(
979            "Reuse Analysis: {:.1}% efficiency ({}/{} nodes), {} structural matches, {} position adjustments",
980            self.reuse_percentage,
981            self.reused_nodes,
982            self.total_old_nodes,
983            self.analysis_stats.structural_matches,
984            self.analysis_stats.position_adjustments
985        )
986    }
987}
988
989#[cfg(test)]
990mod tests {
991    use super::*;
992    use perl_parser_core::{SourceLocation, ast::Node};
993
994    #[test]
995    fn test_advanced_reuse_analyzer_creation() {
996        let analyzer = AdvancedReuseAnalyzer::new();
997        assert_eq!(analyzer.analysis_stats.nodes_analyzed, 0);
998    }
999
1000    #[test]
1001    fn test_structural_hash_calculation() {
1002        let analyzer = AdvancedReuseAnalyzer::new();
1003
1004        // Create sample nodes
1005        let node1 = Node::new(
1006            NodeKind::Number { value: "42".to_string() },
1007            SourceLocation { start: 0, end: 2 },
1008        );
1009
1010        let node2 = Node::new(
1011            NodeKind::Number { value: "99".to_string() },
1012            SourceLocation { start: 0, end: 2 },
1013        );
1014
1015        let hash1 = analyzer.calculate_structural_hash(&node1);
1016        let hash2 = analyzer.calculate_structural_hash(&node2);
1017
1018        // Same structure should have same hash
1019        assert_eq!(hash1, hash2, "Numbers should have same structural hash regardless of value");
1020    }
1021
1022    #[test]
1023    fn test_content_hash_differs_for_different_values() {
1024        let analyzer = AdvancedReuseAnalyzer::new();
1025
1026        let node1 = Node::new(
1027            NodeKind::Number { value: "42".to_string() },
1028            SourceLocation { start: 0, end: 2 },
1029        );
1030
1031        let node2 = Node::new(
1032            NodeKind::Number { value: "99".to_string() },
1033            SourceLocation { start: 0, end: 2 },
1034        );
1035
1036        let hash1 = analyzer.calculate_content_hash(&node1);
1037        let hash2 = analyzer.calculate_content_hash(&node2);
1038
1039        assert_ne!(hash1, hash2, "Different values should have different content hashes");
1040    }
1041
1042    #[test]
1043    fn test_children_count_calculation() {
1044        let analyzer = AdvancedReuseAnalyzer::new();
1045
1046        // Leaf node
1047        let leaf = Node::new(
1048            NodeKind::Number { value: "42".to_string() },
1049            SourceLocation { start: 0, end: 2 },
1050        );
1051        assert_eq!(analyzer.get_children_count(&leaf), 0);
1052
1053        // Binary node
1054        let binary = Node::new(
1055            NodeKind::Binary {
1056                op: "+".to_string(),
1057                left: Box::new(leaf.clone()),
1058                right: Box::new(leaf.clone()),
1059            },
1060            SourceLocation { start: 0, end: 5 },
1061        );
1062        assert_eq!(analyzer.get_children_count(&binary), 2);
1063
1064        // Program node
1065        let program = Node::new(
1066            NodeKind::Program { statements: vec![binary] },
1067            SourceLocation { start: 0, end: 5 },
1068        );
1069        assert_eq!(analyzer.get_children_count(&program), 1);
1070    }
1071
1072    #[test]
1073    fn test_reuse_config_defaults() {
1074        let config = ReuseConfig::default();
1075        assert_eq!(config.min_confidence, 0.75);
1076        assert_eq!(config.max_position_shift, 1000);
1077        assert!(config.aggressive_structural_matching);
1078        assert!(config.enable_content_reuse);
1079        assert_eq!(config.max_analysis_depth, 10);
1080    }
1081
1082    #[test]
1083    fn test_node_compatibility_for_content_update() {
1084        let analyzer = AdvancedReuseAnalyzer::new();
1085
1086        let num1 = Node::new(
1087            NodeKind::Number { value: "42".to_string() },
1088            SourceLocation { start: 0, end: 2 },
1089        );
1090
1091        let num2 = Node::new(
1092            NodeKind::Number { value: "99".to_string() },
1093            SourceLocation { start: 0, end: 2 },
1094        );
1095
1096        let str1 = Node::new(
1097            NodeKind::String { value: "hello".to_string(), interpolated: false },
1098            SourceLocation { start: 0, end: 7 },
1099        );
1100
1101        // Same type nodes should be compatible
1102        assert!(analyzer.are_compatible_for_content_update(&num1, &num2));
1103
1104        // Different type nodes should not be compatible
1105        assert!(!analyzer.are_compatible_for_content_update(&num1, &str1));
1106    }
1107
1108    #[test]
1109    fn test_identify_affected_nodes_marks_only_overlapping_regions() {
1110        let mut analyzer = AdvancedReuseAnalyzer::new();
1111        let tree = Node::new(
1112            NodeKind::Program {
1113                statements: vec![
1114                    Node::new(
1115                        NodeKind::Number { value: "1".to_string() },
1116                        SourceLocation { start: 0, end: 1 },
1117                    ),
1118                    Node::new(
1119                        NodeKind::Number { value: "2".to_string() },
1120                        SourceLocation { start: 20, end: 21 },
1121                    ),
1122                ],
1123            },
1124            SourceLocation { start: 0, end: 21 },
1125        );
1126
1127        let mut edits = EditSet::new();
1128        edits.add(perl_parser_core::edit::Edit::new(
1129            0,
1130            2,
1131            2,
1132            Position::new(0, 0, 0),
1133            Position::new(2, 0, 2),
1134            Position::new(2, 0, 2),
1135        ));
1136
1137        analyzer.identify_affected_nodes(&tree, &edits);
1138
1139        assert!(analyzer.affected_nodes.contains(&0));
1140        assert!(!analyzer.affected_nodes.contains(&20));
1141    }
1142
1143    // --- map_old_position_to_new unit tests ---
1144
1145    fn make_edit(start: usize, old_end: usize, new_end: usize) -> perl_parser_core::edit::Edit {
1146        perl_parser_core::edit::Edit::new(
1147            start,
1148            old_end,
1149            new_end,
1150            Position::new(start, 0, start as u32),
1151            Position::new(old_end, 0, old_end as u32),
1152            Position::new(new_end, 0, new_end as u32),
1153        )
1154    }
1155
1156    #[test]
1157    fn test_map_position_before_edit_unchanged() {
1158        let analyzer = AdvancedReuseAnalyzer::new();
1159        let mut edits = EditSet::new();
1160        edits.add(make_edit(10, 12, 14));
1161        // old_pos=5 is before edit start (10) — no shift applied
1162        assert_eq!(analyzer.map_old_position_to_new(5, &edits), 5);
1163    }
1164
1165    #[test]
1166    fn test_map_position_after_single_edit_shifted() {
1167        let analyzer = AdvancedReuseAnalyzer::new();
1168        let mut edits = EditSet::new();
1169        edits.add(make_edit(8, 10, 11)); // "10"→"100": shift +1
1170        // old_pos=13 should map to 14 (shifted by +1)
1171        assert_eq!(analyzer.map_old_position_to_new(13, &edits), 14);
1172    }
1173
1174    #[test]
1175    fn test_map_position_accumulates_two_shifts() {
1176        let analyzer = AdvancedReuseAnalyzer::new();
1177        let mut edits = EditSet::new();
1178        edits.add(make_edit(8, 10, 11)); // +1
1179        edits.add(make_edit(24, 26, 27)); // +1
1180        // old_pos=30 (past both edits) should shift by +2
1181        assert_eq!(analyzer.map_old_position_to_new(30, &edits), 32);
1182    }
1183
1184    /// old_pos at edit.start_byte is inside the edit region — returns new_end_byte.
1185    ///
1186    /// The previous `consumed_shift` formulation could incorrectly apply a shift
1187    /// for this case instead of detecting the position as inside the edit.
1188    #[test]
1189    fn test_map_position_at_edit_start_returns_new_end() {
1190        let analyzer = AdvancedReuseAnalyzer::new();
1191        let mut edits = EditSet::new();
1192        // Edit covers [10, 20) → new_end_byte = 15 (shrink)
1193        edits.add(make_edit(10, 20, 15));
1194        // old_pos=10 satisfies start_byte(10) <= old_pos(10) < old_end_byte(20)
1195        assert_eq!(analyzer.map_old_position_to_new(10, &edits), 15);
1196    }
1197
1198    #[test]
1199    fn test_map_position_at_edit_old_end_is_shifted() {
1200        let analyzer = AdvancedReuseAnalyzer::new();
1201        let mut edits = EditSet::new();
1202        edits.add(make_edit(10, 20, 25)); // +5 shift
1203        // old_pos=20 is at old_end_byte (exclusive boundary) — shifted by +5
1204        assert_eq!(analyzer.map_old_position_to_new(20, &edits), 25);
1205    }
1206
1207    #[test]
1208    fn test_map_position_inside_edit_returns_new_end() {
1209        let analyzer = AdvancedReuseAnalyzer::new();
1210        let mut edits = EditSet::new();
1211        edits.add(make_edit(10, 30, 20)); // big deletion
1212        assert_eq!(analyzer.map_old_position_to_new(15, &edits), 20);
1213    }
1214
1215    #[test]
1216    fn test_map_position_between_two_edits_shifted_by_first_only() {
1217        let analyzer = AdvancedReuseAnalyzer::new();
1218        let mut edits = EditSet::new();
1219        edits.add(make_edit(5, 7, 8)); // shift +1 at [5,7)
1220        edits.add(make_edit(20, 22, 23)); // shift +1 at [20,22)
1221        // old_pos=10 is between the two edits — shifted only by first (+1)
1222        assert_eq!(analyzer.map_old_position_to_new(10, &edits), 11);
1223    }
1224
1225    // --- try_register_match unit tests ---
1226
1227    #[test]
1228    fn test_try_register_match_rejects_duplicate_new_pos() {
1229        let analyzer = AdvancedReuseAnalyzer::new();
1230        let mut reuse_map = HashMap::new();
1231
1232        let first = analyzer.try_register_match(&mut reuse_map, 0, 10, ReuseType::Direct, 0.9);
1233        assert!(first, "first registration should succeed");
1234
1235        let dup = analyzer.try_register_match(&mut reuse_map, 5, 10, ReuseType::Direct, 0.95);
1236        assert!(!dup, "second registration for same new_pos must be rejected");
1237        // The map still has exactly one entry — old_pos=0 holds the claim
1238        assert_eq!(reuse_map.len(), 1);
1239        assert_eq!(reuse_map[&0].target_position, 10);
1240    }
1241
1242    #[test]
1243    fn test_try_register_match_distinct_new_positions_both_succeed() {
1244        let analyzer = AdvancedReuseAnalyzer::new();
1245        let mut reuse_map = HashMap::new();
1246
1247        assert!(analyzer.try_register_match(&mut reuse_map, 0, 10, ReuseType::Direct, 0.9));
1248        assert!(
1249            analyzer.try_register_match(&mut reuse_map, 5, 20, ReuseType::PositionShift, 0.85,)
1250        );
1251        assert_eq!(reuse_map.len(), 2);
1252    }
1253}
perl_parser/incremental/incremental_advanced_reuse.rs

perl_parser/incremental/
incremental_advanced_reuse.rs