cyrs_syntax/edit.rs
1//! Tree-edit primitives for incremental reparse (cy-zv0, spec §11).
2//!
3//! # Scope
4//!
5//! This module exposes a [`TextEdit`] value type plus an
6//! [`incremental_reparse`] entry point shaped so downstream crates
7//! (`cyrs-db`) can route document edits through a single API. The name
8//! `incremental_reparse` is aspirational: the current implementation is a
9//! **whole-file reparse fallback** that reconstructs the full source from
10//! the old tree and calls [`crate::parse`] on the result. The API shape is
11//! designed so a future "smart" path can slot underneath without breaking
12//! callers.
13//!
14//! # Why an API-first tranche
15//!
16//! Rowan supports lossless green-tree splicing in principle
17//! (`SyntaxNode::replace_with`, `GreenNode::replace_child`), but a
18//! production-quality incremental reparse needs:
19//!
20//! 1. A re-lex boundary sniff so edits inside trivia don't trigger a parser
21//! re-entry.
22//! 2. A minimal sub-tree identification that is safe across clause
23//! boundaries (an edit that deletes `MATCH` must invalidate the
24//! enclosing statement, not just the token).
25//! 3. Error-recovery reconciliation so an edit that introduces or heals a
26//! syntax error produces a tree whose error set matches a full reparse.
27//!
28//! Items 1–3 are a research-sized tranche. Landing the API + whole-file
29//! fallback lets downstream crates migrate onto `Database::edit_file`
30//! (see `cyrs-db`) today; the smart path can then land in a follow-up
31//! bead without touching any caller.
32//!
33//! # Future smart path
34//!
35//! When the `incremental` feature (defaulted-on) is enabled, a future
36//! implementation of [`incremental_reparse`] may short-circuit to a
37//! sub-tree reparse. Consumers must not rely on either the slow or fast
38//! path: the invariant is that the returned tree is byte-equal to
39//! `parse(new_text).syntax()` for some canonical `new_text` derived from
40//! `old_tree` + `edit`.
41//!
42//! # Invariants
43//!
44//! - `incremental_reparse(old_tree, edit)` produces a [`Parse`] whose
45//! `syntax().to_string()` equals the new source text.
46//! - The call is infallible: malformed UTF-8 cannot enter because
47//! [`TextEdit::replace`] takes `impl Into<String>` and
48//! [`TextEdit::apply`] concatenates bytes at char boundaries.
49//! - `edit.range` must lie inside the old source; out-of-range offsets
50//! saturate to the source length (matching `String::replace_range`'s
51//! documented behaviour).
52
53use rowan::NodeOrToken;
54use text_size::{TextRange, TextSize};
55
56use crate::{Parse, SyntaxKind, SyntaxNode, parse};
57
58/// A single-range text edit.
59///
60/// Shaped to mirror LSP `TextEdit` / rust-analyzer's `TextEdit`: a byte
61/// range inside the *old* source text plus the UTF-8 replacement string.
62///
63/// # Construction
64///
65/// Use [`TextEdit::replace`] for a generic range replacement, or
66/// [`TextEdit::insert`] for a zero-length insertion at a single offset.
67///
68/// # Coordinate space
69///
70/// The `range` is in **byte** offsets over the old source, not characters
71/// and not LSP UTF-16 columns. Callers that start from LSP ranges must
72/// translate first (see [`crate::LineIndex`]).
73#[derive(Debug, Clone, PartialEq, Eq)]
74pub struct TextEdit {
75 /// Byte range (inside the old source) that is replaced.
76 pub range: TextRange,
77 /// UTF-8 replacement text. Empty string = deletion.
78 pub replacement: String,
79}
80
81impl TextEdit {
82 /// Build a replace-range edit.
83 ///
84 /// `range` is in **byte** offsets over the *old* source text. The
85 /// replacement is owned to keep the edit value trivially movable.
86 #[must_use]
87 pub fn replace(range: TextRange, replacement: impl Into<String>) -> Self {
88 Self {
89 range,
90 replacement: replacement.into(),
91 }
92 }
93
94 /// Build a pure insertion at `offset`.
95 #[must_use]
96 pub fn insert(offset: TextSize, text: impl Into<String>) -> Self {
97 Self {
98 range: TextRange::empty(offset),
99 replacement: text.into(),
100 }
101 }
102
103 /// Apply this edit to `src`, returning the resulting text.
104 ///
105 /// Offsets that exceed `src.len()` are clamped to the end of the
106 /// source (matching `String::replace_range`'s implicit behaviour).
107 /// Both endpoints of `range` are rounded *down* to the previous
108 /// UTF-8 char boundary if they fall in the middle of a multi-byte
109 /// sequence; callers feeding ASCII-only Cypher never hit this path.
110 #[must_use]
111 pub fn apply(&self, src: &str) -> String {
112 let len = src.len();
113 let start = usize::from(self.range.start()).min(len);
114 let end = usize::from(self.range.end()).min(len).max(start);
115
116 // Snap to char boundaries defensively so we never slice through a
117 // multi-byte sequence. ASCII-only inputs (the hot path for the
118 // Cypher corpus) take zero iterations of the inner loops.
119 let mut s = start;
120 while s > 0 && !src.is_char_boundary(s) {
121 s -= 1;
122 }
123 let mut e = end;
124 while e < len && !src.is_char_boundary(e) {
125 e += 1;
126 }
127
128 let mut out = String::with_capacity(len - (e - s) + self.replacement.len());
129 out.push_str(&src[..s]);
130 out.push_str(&self.replacement);
131 out.push_str(&src[e..]);
132 out
133 }
134}
135
136/// Reparse after applying `edit` to `old_tree`'s source.
137///
138/// # Implementation — smart sub-tree splice with whole-file fallback
139///
140/// When the `incremental` feature is enabled (default-on, cy-li5):
141///
142/// 1. Locate the smallest `STATEMENT` node in `old_tree` that **fully
143/// contains** `edit.range` via [`rowan::SyntaxNode::covering_element`]
144/// and an upward walk to the nearest `STATEMENT` ancestor.
145/// 2. Reconstruct the new text for that statement's span by stitching
146/// `old_text[stmt.start..edit.start] + edit.replacement +
147/// old_text[edit.end..stmt.end]`.
148/// 3. Lex the candidate statement text in isolation. If a top-level `;`
149/// or `UNION` appears inside the new text, the edit changed the
150/// statement count — bail to whole-file.
151/// 4. Parse the candidate text as a wrapped source-file, extract the
152/// `STATEMENT` green sub-tree, and splice it in via
153/// [`rowan::SyntaxNode::replace_with`].
154/// 5. Re-derive errors by full re-parse of the new source. This step
155/// keeps the public-API invariant (errors match what `parse(new_src)`
156/// would produce) honest. A future tranche may incrementally
157/// reconcile errors, but that is *not* a cy-li5 deliverable —
158/// correctness gates the optimization.
159///
160/// If any of the bail conditions trip (no enclosing STATEMENT, edit
161/// straddles a `;`, top-level separator introduced/removed, candidate
162/// text fails to parse to a single STATEMENT), the implementation falls
163/// back to a whole-file [`parse`]. The whole-file path is also taken
164/// unconditionally when the `incremental` feature is disabled — that
165/// behaviour is preserved as the slow-but-always-correct A/B baseline
166/// (cy-zv0).
167///
168/// # Caveat — bench observability
169///
170/// The `bench_incremental_edit` 2k/1k ratio gate is driven by
171/// `Database::edit_file`, which today reduces this function's return
172/// value to its `syntax().to_string()` and feeds the string back into
173/// Salsa. As a result, the green-tree splice savings *inside* this
174/// function are not yet observable end-to-end at the bench. Threading
175/// the precomputed [`Parse`] into Salsa as a memo is a separate
176/// follow-up tranche; the cy-li5 acceptance criterion that the bench
177/// ratio drops below 1.5× depends on that wiring landing.
178#[must_use]
179pub fn incremental_reparse(old_tree: &SyntaxNode, edit: &TextEdit) -> Parse {
180 let old_src = old_tree.to_string();
181 let new_src = edit.apply(&old_src);
182
183 #[cfg(feature = "incremental")]
184 {
185 if let Some(parsed) = try_incremental_splice(old_tree, &old_src, edit, &new_src) {
186 return parsed;
187 }
188 }
189
190 parse(&new_src)
191}
192
193/// Smart-path attempt: splice a freshly-parsed STATEMENT sub-tree into
194/// `old_tree` and re-derive errors. Returns `None` when any bail
195/// condition trips; the caller then falls back to a whole-file reparse.
196///
197/// The function is `#[cfg(feature = "incremental")]`-gated so disabling
198/// the feature keeps the binary identical to the cy-zv0 fallback.
199#[cfg(feature = "incremental")]
200fn try_incremental_splice(
201 old_tree: &SyntaxNode,
202 old_src: &str,
203 edit: &TextEdit,
204 new_src: &str,
205) -> Option<Parse> {
206 use crate::lexer::lex;
207
208 // 1. Find the smallest enclosing STATEMENT node.
209 //
210 // `covering_element` returns the smallest element fully containing
211 // the range. We walk upward until we hit a STATEMENT (or fall off).
212 let edit_range = edit.range;
213 let stmt = covering_statement(old_tree, edit_range)?;
214 let stmt_range = stmt.text_range();
215
216 // The covering STATEMENT must strictly contain the edit range — if
217 // the edit touches the leading/trailing `;` separator that lives
218 // *outside* the STATEMENT, we'd lose the separator. (rowan's
219 // covering_element already ensures `stmt_range.contains_range(edit)`
220 // is true, but we re-assert defensively.)
221 if !stmt_range.contains_range(edit_range) {
222 return None;
223 }
224
225 // 2. Stitch the new statement text.
226 let stmt_start = usize::from(stmt_range.start());
227 let stmt_end = usize::from(stmt_range.end());
228 let edit_start = usize::from(edit_range.start()).clamp(stmt_start, stmt_end);
229 let edit_end = usize::from(edit_range.end()).clamp(edit_start, stmt_end);
230 if !old_src.is_char_boundary(stmt_start)
231 || !old_src.is_char_boundary(stmt_end)
232 || !old_src.is_char_boundary(edit_start)
233 || !old_src.is_char_boundary(edit_end)
234 {
235 return None;
236 }
237 let mut new_stmt_text = String::with_capacity(
238 (edit_start - stmt_start) + edit.replacement.len() + (stmt_end - edit_end),
239 );
240 new_stmt_text.push_str(&old_src[stmt_start..edit_start]);
241 new_stmt_text.push_str(&edit.replacement);
242 new_stmt_text.push_str(&old_src[edit_end..stmt_end]);
243
244 // 3. Boundary safety: if the lexed statement text contains a `;` or
245 // `UNION` keyword (which are statement-count-changing tokens), the
246 // edit may have introduced a new statement boundary. Bail.
247 let toks = lex(&new_stmt_text);
248 for t in &toks {
249 match t.kind {
250 SyntaxKind::SEMI | SyntaxKind::UNION_KW => return None,
251 _ => {}
252 }
253 }
254
255 // 4. Parse the candidate text and extract a single STATEMENT child.
256 // The simplest robust route: full `parse` on the candidate text,
257 // expect exactly one STATEMENT child of the SOURCE_FILE, take its
258 // green sub-tree. If the candidate text doesn't normalise to a
259 // single STATEMENT (e.g. empty, leading-junk recovery, multiple
260 // statements somehow), bail.
261 let cand = parse(&new_stmt_text);
262 let cand_root = cand.syntax();
263 let mut stmt_children = cand_root
264 .children()
265 .filter(|n| n.kind() == SyntaxKind::STATEMENT);
266 let new_stmt = stmt_children.next()?;
267 if stmt_children.next().is_some() {
268 return None;
269 }
270 // The candidate STATEMENT must cover the entire candidate text —
271 // otherwise leading/trailing trivia would be lost across the splice
272 // boundary in ways the simple replace_with can't preserve.
273 if new_stmt.text_range() != cand_root.text_range() {
274 return None;
275 }
276
277 // 5. Splice. `replace_with` rebuilds the green tree along the spine
278 // only — O(depth × siblings-per-level), not O(file).
279 let new_green_root = stmt.replace_with(new_stmt.green().into_owned());
280
281 // 6. Errors: re-parse the new source to derive a correct error set.
282 // Note this defeats the splice savings *for the error half* of
283 // Parse; a future tranche can incrementally reconcile errors by
284 // keeping a sidecar map. The bench is dominated by tree work,
285 // not error scanning, so this is the right correctness/cost
286 // trade-off for cy-li5.
287 let full = parse(new_src);
288
289 // Sanity: the spliced tree's text MUST equal the new source. If it
290 // doesn't, our bail conditions missed something — fall back so we
291 // never violate the API's byte-equivalence invariant.
292 let spliced_root = SyntaxNode::new_root(new_green_root.clone());
293 if spliced_root.text() != new_src {
294 return None;
295 }
296
297 Some(make_parse(new_green_root, full.errors().to_vec()))
298}
299
300/// Walk upward from `covering_element(range)` until we find the smallest
301/// `STATEMENT` ancestor. Returns `None` when no such ancestor exists
302/// (e.g. the edit lies between statements or in the source-file root's
303/// trailing trivia).
304#[cfg(feature = "incremental")]
305fn covering_statement(root: &SyntaxNode, range: TextRange) -> Option<SyntaxNode> {
306 // `covering_element` panics if `range` is outside the root's text.
307 // Clamp defensively to the root's range so out-of-bounds edits go
308 // straight to the fallback rather than panicking.
309 let root_range = root.text_range();
310 if !root_range.contains_range(range) {
311 return None;
312 }
313 let elem = root.covering_element(range);
314 let start_node = match elem {
315 NodeOrToken::Node(n) => n,
316 NodeOrToken::Token(t) => t.parent()?,
317 };
318 let mut cur = Some(start_node);
319 while let Some(n) = cur {
320 if n.kind() == SyntaxKind::STATEMENT {
321 return Some(n);
322 }
323 cur = n.parent();
324 }
325 None
326}
327
328/// Construct a [`Parse`] from raw parts. Lives behind the `incremental`
329/// feature because the fallback path uses `parse(...)` directly and
330/// doesn't need the constructor.
331#[cfg(feature = "incremental")]
332fn make_parse(green: rowan::GreenNode, errors: Vec<crate::SyntaxError>) -> Parse {
333 Parse::from_parts(green, errors)
334}
335
336// ---------------------------------------------------------------------------
337// Tests
338// ---------------------------------------------------------------------------
339
340#[cfg(test)]
341mod tests {
342 use super::*;
343
344 #[test]
345 fn insert_at_middle_preserves_prefix_and_suffix() {
346 let src = "RETURN 1";
347 // Offset 6 is the boundary between "RETURN" and " 1".
348 let edit = TextEdit::insert(TextSize::from(6), "0");
349 let out = edit.apply(src);
350 assert_eq!(out, "RETURN0 1");
351 }
352
353 #[test]
354 fn replace_range() {
355 let src = "RETURN 1";
356 let range = TextRange::new(TextSize::from(7), TextSize::from(8));
357 let edit = TextEdit::replace(range, "42");
358 let out = edit.apply(src);
359 assert_eq!(out, "RETURN 42");
360 }
361
362 #[test]
363 fn delete_range() {
364 let src = "MATCH (n) RETURN n";
365 let range = TextRange::new(TextSize::from(0), TextSize::from(10));
366 let edit = TextEdit::replace(range, "");
367 let out = edit.apply(src);
368 assert_eq!(out, "RETURN n");
369 }
370
371 #[test]
372 fn out_of_range_saturates_to_end() {
373 let src = "RETURN 1";
374 let edit = TextEdit::insert(TextSize::from(999), ";");
375 let out = edit.apply(src);
376 assert_eq!(out, "RETURN 1;");
377 }
378
379 #[test]
380 fn incremental_reparse_roundtrips() {
381 let p = parse("RETURN 1");
382 let root = p.syntax();
383 let edit = TextEdit::replace(TextRange::new(TextSize::from(7), TextSize::from(8)), "42");
384 let np = incremental_reparse(&root, &edit);
385 assert_eq!(np.syntax().to_string(), "RETURN 42");
386 assert!(np.errors().is_empty(), "edit keeps the file parseable");
387 }
388
389 // ------------------------------------------------------------------
390 // cy-li5: smart-path coverage
391 // ------------------------------------------------------------------
392
393 /// Helper: assert that `incremental_reparse` produces a tree that
394 /// matches a fresh whole-file parse in both text and error set, then
395 /// return the resulting Parse so the caller can introspect further.
396 #[cfg(feature = "incremental")]
397 fn assert_equivalent_to_full(old: &SyntaxNode, edit: &TextEdit) -> Parse {
398 let new_src = edit.apply(&old.to_string());
399 let smart = incremental_reparse(old, edit);
400 let full = parse(&new_src);
401 assert_eq!(
402 smart.syntax().to_string(),
403 full.syntax().to_string(),
404 "smart-path text must equal whole-file parse text"
405 );
406 assert_eq!(
407 smart.errors().len(),
408 full.errors().len(),
409 "smart-path error count must equal whole-file ({}); errors = {:?}",
410 full.errors().len(),
411 smart
412 .errors()
413 .iter()
414 .map(|e| &e.message)
415 .collect::<Vec<_>>()
416 );
417 smart
418 }
419
420 /// Edit fully inside a single statement — smart path should hit, and
421 /// the result must agree with whole-file parse in text + error count.
422 #[test]
423 #[cfg(feature = "incremental")]
424 fn smart_path_inside_single_statement() {
425 let src = "MATCH (n) RETURN n;\nMATCH (m) RETURN m;\n";
426 let p = parse(src);
427 assert!(p.errors().is_empty(), "fixture parses clean");
428 // Replace `n` in the FIRST statement's RETURN clause (offset 17).
429 let edit = TextEdit::replace(TextRange::new(TextSize::new(17), TextSize::new(18)), "x");
430 let np = assert_equivalent_to_full(&p.syntax(), &edit);
431 assert_eq!(
432 np.syntax().to_string(),
433 "MATCH (n) RETURN x;\nMATCH (m) RETURN m;\n"
434 );
435 }
436
437 /// Edit at a clause boundary — smart path may take it (the WHERE is
438 /// inside the same STATEMENT) but in either case the result must
439 /// match a whole-file parse.
440 #[test]
441 #[cfg(feature = "incremental")]
442 fn smart_path_clause_boundary_inside_statement() {
443 let src = "MATCH (n) RETURN n;\n";
444 let p = parse(src);
445 // Insert a WHERE clause between the MATCH and the RETURN.
446 let edit = TextEdit::insert(TextSize::new(10), "WHERE n.x = 1 ");
447 let np = assert_equivalent_to_full(&p.syntax(), &edit);
448 assert_eq!(
449 np.syntax().to_string(),
450 "MATCH (n) WHERE n.x = 1 RETURN n;\n"
451 );
452 }
453
454 /// Edit that introduces a new top-level `;` — must bail to whole-file
455 /// (statement count changes). The result must still be a valid CST
456 /// with the right text.
457 #[test]
458 #[cfg(feature = "incremental")]
459 fn smart_path_bails_when_introducing_semicolon() {
460 let src = "MATCH (n) RETURN n";
461 let p = parse(src);
462 // Insert "; MATCH (m) RETURN m" before EOF. The "; " inside the
463 // statement covering element forces the bail.
464 let edit = TextEdit::insert(TextSize::new(18), "; MATCH (m) RETURN m");
465 let np = assert_equivalent_to_full(&p.syntax(), &edit);
466 assert_eq!(
467 np.syntax().to_string(),
468 "MATCH (n) RETURN n; MATCH (m) RETURN m"
469 );
470 }
471
472 /// Edit that introduces a syntax error — smart path or fallback must
473 /// produce a tree with `errors()` matching a whole-file parse, and
474 /// the tree must still be byte-lossless (spec §4.4).
475 #[test]
476 #[cfg(feature = "incremental")]
477 fn smart_path_introduces_syntax_error() {
478 let src = "MATCH (n) RETURN n;\n";
479 let p = parse(src);
480 // Replace `(n)` with `(n` — unclosed paren, syntax error.
481 let edit = TextEdit::replace(TextRange::new(TextSize::new(6), TextSize::new(9)), "(n");
482 let np = assert_equivalent_to_full(&p.syntax(), &edit);
483 assert!(!np.errors().is_empty(), "edit must produce errors");
484 assert_eq!(np.syntax().to_string(), "MATCH (n RETURN n;\n");
485 }
486
487 /// Edit that *heals* an existing syntax error must produce a clean
488 /// tree, verified against whole-file parse equivalence.
489 #[test]
490 #[cfg(feature = "incremental")]
491 fn smart_path_heals_syntax_error() {
492 let src = "MATCH (n RETURN n;\n";
493 let p = parse(src);
494 assert!(
495 !p.errors().is_empty(),
496 "fixture has the unclosed paren error"
497 );
498 // Insert the missing `)` — heal the parse.
499 let edit = TextEdit::insert(TextSize::new(8), ")");
500 let np = assert_equivalent_to_full(&p.syntax(), &edit);
501 assert_eq!(np.syntax().to_string(), "MATCH (n) RETURN n;\n");
502 assert!(np.errors().is_empty(), "heal must produce a clean tree");
503 }
504}