1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
//! Lexer checkpointing for incremental parsing
//!
//! This module provides checkpointing functionality for the Perl lexer,
//! allowing it to save and restore its state for incremental parsing.
use crate::{LexerMode, Position};
use std::fmt;
/// A checkpoint that captures the complete lexer state
#[derive(Debug, Clone, PartialEq)]
pub struct LexerCheckpoint {
/// Current position in the input
pub position: usize,
/// Current lexer mode (`ExpectTerm`, `ExpectOperator`, etc.)
pub mode: LexerMode,
/// Stack for nested delimiters in s{}{} constructs
pub delimiter_stack: Vec<char>,
/// Whether we're inside prototype parens after 'sub'
pub in_prototype: bool,
/// Paren depth to track when we exit prototype
pub prototype_depth: usize,
/// Whether we just saw 'sub' and are waiting for a possible prototype
pub after_sub: bool,
/// Whether we just saw '->' (suppresses s/tr/y as substitution)
pub after_arrow: bool,
/// Depth of hash-subscript brace nesting.
/// When > 0, suppresses quote-op detection inside hash subscripts/slices.
pub hash_brace_depth: usize,
/// Whether the lexer just emitted a complete $var/@var/%var token.
/// Used by the `{` handler to distinguish hash subscript openers from block openers.
pub after_var_subscript: bool,
/// Depth of open parentheses (used to guard heredoc vs bitshift disambiguation)
pub paren_depth: usize,
/// Current position with line/column tracking
pub current_pos: Position,
/// Additional context for complex states
pub context: CheckpointContext,
}
/// Additional context that may be needed for certain lexer states
#[derive(Debug, Clone, PartialEq)]
pub enum CheckpointContext {
/// Normal lexing
Normal,
/// Inside a heredoc (tracks the terminator)
Heredoc { terminator: String, is_interpolated: bool },
/// Inside a format body
Format { start_position: usize },
/// Inside a regex or substitution
Regex { delimiter: char, flags_position: Option<usize> },
/// Inside a quote-like operator
QuoteLike { operator: String, delimiter: char, is_paired: bool },
}
impl LexerCheckpoint {
/// Create a new checkpoint with default values
pub fn new() -> Self {
Self {
position: 0,
mode: LexerMode::ExpectTerm,
delimiter_stack: Vec::new(),
in_prototype: false,
prototype_depth: 0,
after_sub: false,
after_arrow: false,
hash_brace_depth: 0,
after_var_subscript: false,
paren_depth: 0,
current_pos: Position::start(),
context: CheckpointContext::Normal,
}
}
/// Create a checkpoint at a specific position
pub fn at_position(position: usize) -> Self {
Self { position, ..Self::new() }
}
/// Check if this checkpoint is at the start of input
pub fn is_at_start(&self) -> bool {
self.position == 0
}
/// Calculate the difference between two checkpoints
pub fn diff(&self, other: &Self) -> CheckpointDiff {
CheckpointDiff {
position_delta: self.position as isize - other.position as isize,
mode_changed: self.mode != other.mode,
delimiter_stack_changed: self.delimiter_stack != other.delimiter_stack,
prototype_state_changed: self.in_prototype != other.in_prototype
|| self.prototype_depth != other.prototype_depth
|| self.after_sub != other.after_sub
|| self.after_arrow != other.after_arrow
|| self.hash_brace_depth != other.hash_brace_depth
|| self.after_var_subscript != other.after_var_subscript
|| self.paren_depth != other.paren_depth,
context_changed: self.context != other.context,
}
}
/// Apply an edit to this checkpoint
pub fn apply_edit(&mut self, start: usize, old_len: usize, new_len: usize) {
if self.position > start {
if self.position >= start + old_len {
// Checkpoint is after the edit
self.position = self.position - old_len + new_len;
} else {
// Checkpoint is inside the edit - invalidate
self.position = start;
self.mode = LexerMode::ExpectTerm;
self.delimiter_stack.clear();
self.in_prototype = false;
self.prototype_depth = 0;
self.after_sub = false;
self.after_arrow = false;
self.hash_brace_depth = 0;
self.after_var_subscript = false;
self.paren_depth = 0;
self.context = CheckpointContext::Normal;
}
}
// Update position tracking
// In a real implementation, we'd update line/column based on the edit
}
/// Validate that this checkpoint is valid for the given input
pub fn is_valid_for(&self, input: &str) -> bool {
self.position <= input.len()
}
}
impl Default for LexerCheckpoint {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for LexerCheckpoint {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"Checkpoint@{} mode={:?} delims={} proto={} after_sub={}",
self.position,
self.mode,
self.delimiter_stack.len(),
self.in_prototype,
self.after_sub
)
}
}
/// The difference between two [`LexerCheckpoint`]s, produced by
/// [`LexerCheckpoint::diff`].
#[derive(Debug)]
pub struct CheckpointDiff {
/// Signed byte-offset difference between the two checkpoint positions.
pub position_delta: isize,
/// Whether the lexer mode (term vs. operator) changed.
pub mode_changed: bool,
/// Whether the nested delimiter stack differs.
pub delimiter_stack_changed: bool,
/// Whether any prototype-tracking state differs.
pub prototype_state_changed: bool,
/// Whether the [`CheckpointContext`] variant changed.
pub context_changed: bool,
}
impl CheckpointDiff {
/// Check if any state changed besides position
pub fn has_state_changes(&self) -> bool {
self.mode_changed
|| self.delimiter_stack_changed
|| self.prototype_state_changed
|| self.context_changed
}
}
/// Trait for types that support checkpointing
pub trait Checkpointable {
/// Create a checkpoint of the current state
fn checkpoint(&self) -> LexerCheckpoint;
/// Restore state from a checkpoint
fn restore(&mut self, checkpoint: &LexerCheckpoint);
/// Check if we can restore to a given checkpoint
fn can_restore(&self, checkpoint: &LexerCheckpoint) -> bool;
}
/// A checkpoint cache for efficient incremental parsing
pub struct CheckpointCache {
/// Cached checkpoints at various positions
checkpoints: Vec<(usize, LexerCheckpoint)>,
/// Maximum number of checkpoints to cache
max_checkpoints: usize,
}
impl CheckpointCache {
/// Create a new checkpoint cache
pub fn new(max_checkpoints: usize) -> Self {
Self { checkpoints: Vec::new(), max_checkpoints }
}
/// Add a checkpoint to the cache
pub fn add(&mut self, checkpoint: LexerCheckpoint) {
let position = checkpoint.position;
// Remove any existing checkpoint at this position
self.checkpoints.retain(|(pos, _)| *pos != position);
// Add the new checkpoint
self.checkpoints.push((position, checkpoint));
// Sort by position
self.checkpoints.sort_by_key(|(pos, _)| *pos);
// Trim to max size
if self.checkpoints.len() > self.max_checkpoints {
// Keep checkpoints evenly distributed
let total = self.checkpoints.len();
let step = total as f64 / self.max_checkpoints as f64;
let mut kept = Vec::new();
for i in 0..self.max_checkpoints {
let idx = (i as f64 * step) as usize;
if idx < total {
kept.push(self.checkpoints[idx].clone());
}
}
self.checkpoints = kept;
}
}
/// Find the nearest checkpoint at or before a given position.
///
/// Uses binary search over the sorted `checkpoints` vector (invariant maintained by
/// [`Self::add`]) for O(log N) rather than the previous O(N) linear scan. This is
/// important when the checkpoint limit is large (50+) for big documents (#2080).
pub fn find_before(&self, position: usize) -> Option<&LexerCheckpoint> {
// `partition_point` returns the index of the first element for which the
// predicate is false (i.e. the first entry with pos > position).
// The entry just before that index is the last one with pos <= position.
let idx = self.checkpoints.partition_point(|(pos, _)| *pos <= position);
if idx == 0 { None } else { self.checkpoints.get(idx - 1).map(|(_, cp)| cp) }
}
/// Clear all cached checkpoints
pub fn clear(&mut self) {
self.checkpoints.clear();
}
/// Apply an edit to all cached checkpoints
pub fn apply_edit(&mut self, start: usize, old_len: usize, new_len: usize) {
// Update all checkpoints
for (pos, checkpoint) in &mut self.checkpoints {
checkpoint.apply_edit(start, old_len, new_len);
*pos = checkpoint.position;
}
// Remove invalid checkpoints
self.checkpoints
.retain(|(_, cp)| !matches!(cp.context, CheckpointContext::Normal) || cp.position > 0);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_checkpoint_creation() {
let cp = LexerCheckpoint::new();
assert_eq!(cp.position, 0);
assert_eq!(cp.mode, LexerMode::ExpectTerm);
assert!(cp.delimiter_stack.is_empty());
}
#[test]
fn test_checkpoint_diff() {
let cp1 = LexerCheckpoint::at_position(10);
let mut cp2 = cp1.clone();
cp2.position = 20;
cp2.mode = LexerMode::ExpectOperator;
let diff = cp2.diff(&cp1);
assert_eq!(diff.position_delta, 10);
assert!(diff.mode_changed);
assert!(!diff.delimiter_stack_changed);
}
#[test]
fn test_checkpoint_edit() {
let mut cp = LexerCheckpoint::at_position(50);
// Edit before checkpoint
cp.apply_edit(10, 5, 10);
assert_eq!(cp.position, 55); // Shifted by +5
// Edit after checkpoint
let mut cp2 = LexerCheckpoint::at_position(50);
cp2.apply_edit(60, 10, 5);
assert_eq!(cp2.position, 50); // No change
// Edit containing checkpoint
let mut cp3 = LexerCheckpoint::at_position(50);
cp3.apply_edit(45, 10, 5);
assert_eq!(cp3.position, 45); // Reset to edit start
}
#[test]
fn test_checkpoint_cache() -> std::result::Result<(), Box<dyn std::error::Error>> {
let mut cache = CheckpointCache::new(3);
// Add checkpoints
cache.add(LexerCheckpoint::at_position(10));
cache.add(LexerCheckpoint::at_position(20));
cache.add(LexerCheckpoint::at_position(30));
cache.add(LexerCheckpoint::at_position(40));
// Should keep 3 evenly distributed
assert_eq!(cache.checkpoints.len(), 3);
// Find nearest before position 25
let cp = cache.find_before(25).ok_or("Expected checkpoint before position 25")?;
assert_eq!(cp.position, 20);
Ok(())
}
/// Verify `find_before` uses sorted-invariant binary search (O(log N)).
///
/// This test confirms correctness for: exact match, between two entries,
/// before first entry, and after last entry.
#[test]
fn test_find_before_binary_search() -> std::result::Result<(), Box<dyn std::error::Error>> {
let mut cache = CheckpointCache::new(50);
for pos in [10usize, 20, 30, 40, 50] {
cache.add(LexerCheckpoint::at_position(pos));
}
// Exact match: should return that checkpoint
let cp = cache.find_before(30).ok_or("find_before(30) should hit")?;
assert_eq!(cp.position, 30, "exact match should return the entry at 30");
// Between entries: should return the lower one
let cp = cache.find_before(25).ok_or("find_before(25) should hit")?;
assert_eq!(cp.position, 20, "between 20 and 30 should return 20");
// After all entries: should return the last
let cp = cache.find_before(100).ok_or("find_before(100) should hit")?;
assert_eq!(cp.position, 50, "after last entry should return 50");
// Before first entry: should return None
let cp = cache.find_before(5);
assert!(cp.is_none(), "before first entry (5 < 10) should return None");
Ok(())
}
/// Verify that CheckpointedIncrementalParser uses 50 checkpoints (Gap B).
#[test]
fn test_checkpoint_cache_capacity_50() {
// A cache of capacity 50 must not evict until we exceed 50.
let mut cache = CheckpointCache::new(50);
for i in 0..50usize {
cache.add(LexerCheckpoint::at_position(i * 100));
}
assert_eq!(
cache.checkpoints.len(),
50,
"a capacity-50 cache must hold exactly 50 checkpoints before eviction"
);
// Adding one more should evict down to 50, not to 10.
cache.add(LexerCheckpoint::at_position(5000));
assert_eq!(
cache.checkpoints.len(),
50,
"eviction must keep exactly max_checkpoints entries"
);
}
}