Skip to main content

cyrs_syntax/
edit.rs

1//! Tree-edit primitives for incremental reparse (cy-zv0, spec §11).
2//!
3//! # Scope
4//!
5//! This module exposes a [`TextEdit`] value type plus an
6//! [`incremental_reparse`] entry point shaped so downstream crates
7//! (`cyrs-db`) can route document edits through a single API. The name
8//! `incremental_reparse` is aspirational: the current implementation is a
9//! **whole-file reparse fallback** that reconstructs the full source from
10//! the old tree and calls [`crate::parse`] on the result. The API shape is
11//! designed so a future "smart" path can slot underneath without breaking
12//! callers.
13//!
14//! # Why an API-first tranche
15//!
16//! Rowan supports lossless green-tree splicing in principle
17//! (`SyntaxNode::replace_with`, `GreenNode::replace_child`), but a
18//! production-quality incremental reparse needs:
19//!
20//! 1. A re-lex boundary sniff so edits inside trivia don't trigger a parser
21//!    re-entry.
22//! 2. A minimal sub-tree identification that is safe across clause
23//!    boundaries (an edit that deletes `MATCH` must invalidate the
24//!    enclosing statement, not just the token).
25//! 3. Error-recovery reconciliation so an edit that introduces or heals a
26//!    syntax error produces a tree whose error set matches a full reparse.
27//!
28//! Items 1–3 are a research-sized tranche. Landing the API + whole-file
29//! fallback lets downstream crates migrate onto `Database::edit_file`
30//! (see `cyrs-db`) today; the smart path can then land in a follow-up
31//! bead without touching any caller.
32//!
33//! # Future smart path
34//!
35//! When the `incremental` feature (defaulted-on) is enabled, a future
36//! implementation of [`incremental_reparse`] may short-circuit to a
37//! sub-tree reparse. Consumers must not rely on either the slow or fast
38//! path: the invariant is that the returned tree is byte-equal to
39//! `parse(new_text).syntax()` for some canonical `new_text` derived from
40//! `old_tree` + `edit`.
41//!
42//! # Invariants
43//!
44//! - `incremental_reparse(old_tree, edit)` produces a [`Parse`] whose
45//!   `syntax().to_string()` equals the new source text.
46//! - The call is infallible: malformed UTF-8 cannot enter because
47//!   [`TextEdit::replace`] takes `impl Into<String>` and
48//!   [`TextEdit::apply`] concatenates bytes at char boundaries.
49//! - `edit.range` must lie inside the old source; out-of-range offsets
50//!   saturate to the source length (matching `String::replace_range`'s
51//!   documented behaviour).
52
53use rowan::NodeOrToken;
54use text_size::{TextRange, TextSize};
55
56use crate::{Parse, SyntaxKind, SyntaxNode, parse};
57
58/// A single-range text edit.
59///
60/// Shaped to mirror LSP `TextEdit` / rust-analyzer's `TextEdit`: a byte
61/// range inside the *old* source text plus the UTF-8 replacement string.
62///
63/// # Construction
64///
65/// Use [`TextEdit::replace`] for a generic range replacement, or
66/// [`TextEdit::insert`] for a zero-length insertion at a single offset.
67///
68/// # Coordinate space
69///
70/// The `range` is in **byte** offsets over the old source, not characters
71/// and not LSP UTF-16 columns. Callers that start from LSP ranges must
72/// translate first (see [`crate::LineIndex`]).
73#[derive(Debug, Clone, PartialEq, Eq)]
74pub struct TextEdit {
75    /// Byte range (inside the old source) that is replaced.
76    pub range: TextRange,
77    /// UTF-8 replacement text. Empty string = deletion.
78    pub replacement: String,
79}
80
81impl TextEdit {
82    /// Build a replace-range edit.
83    ///
84    /// `range` is in **byte** offsets over the *old* source text. The
85    /// replacement is owned to keep the edit value trivially movable.
86    #[must_use]
87    pub fn replace(range: TextRange, replacement: impl Into<String>) -> Self {
88        Self {
89            range,
90            replacement: replacement.into(),
91        }
92    }
93
94    /// Build a pure insertion at `offset`.
95    #[must_use]
96    pub fn insert(offset: TextSize, text: impl Into<String>) -> Self {
97        Self {
98            range: TextRange::empty(offset),
99            replacement: text.into(),
100        }
101    }
102
103    /// Apply this edit to `src`, returning the resulting text.
104    ///
105    /// Offsets that exceed `src.len()` are clamped to the end of the
106    /// source (matching `String::replace_range`'s implicit behaviour).
107    /// Both endpoints of `range` are rounded *down* to the previous
108    /// UTF-8 char boundary if they fall in the middle of a multi-byte
109    /// sequence; callers feeding ASCII-only Cypher never hit this path.
110    #[must_use]
111    pub fn apply(&self, src: &str) -> String {
112        let len = src.len();
113        let start = usize::from(self.range.start()).min(len);
114        let end = usize::from(self.range.end()).min(len).max(start);
115
116        // Snap to char boundaries defensively so we never slice through a
117        // multi-byte sequence. ASCII-only inputs (the hot path for the
118        // Cypher corpus) take zero iterations of the inner loops.
119        let mut s = start;
120        while s > 0 && !src.is_char_boundary(s) {
121            s -= 1;
122        }
123        let mut e = end;
124        while e < len && !src.is_char_boundary(e) {
125            e += 1;
126        }
127
128        let mut out = String::with_capacity(len - (e - s) + self.replacement.len());
129        out.push_str(&src[..s]);
130        out.push_str(&self.replacement);
131        out.push_str(&src[e..]);
132        out
133    }
134}
135
136/// Reparse after applying `edit` to `old_tree`'s source.
137///
138/// # Implementation — smart sub-tree splice with whole-file fallback
139///
140/// When the `incremental` feature is enabled (default-on, cy-li5):
141///
142/// 1. Locate the smallest `STATEMENT` node in `old_tree` that **fully
143///    contains** `edit.range` via [`rowan::SyntaxNode::covering_element`]
144///    and an upward walk to the nearest `STATEMENT` ancestor.
145/// 2. Reconstruct the new text for that statement's span by stitching
146///    `old_text[stmt.start..edit.start] + edit.replacement +
147///    old_text[edit.end..stmt.end]`.
148/// 3. Lex the candidate statement text in isolation. If a top-level `;`
149///    or `UNION` appears inside the new text, the edit changed the
150///    statement count — bail to whole-file.
151/// 4. Parse the candidate text as a wrapped source-file, extract the
152///    `STATEMENT` green sub-tree, and splice it in via
153///    [`rowan::SyntaxNode::replace_with`].
154/// 5. Re-derive errors by full re-parse of the new source. This step
155///    keeps the public-API invariant (errors match what `parse(new_src)`
156///    would produce) honest. A future tranche may incrementally
157///    reconcile errors, but that is *not* a cy-li5 deliverable —
158///    correctness gates the optimization.
159///
160/// If any of the bail conditions trip (no enclosing STATEMENT, edit
161/// straddles a `;`, top-level separator introduced/removed, candidate
162/// text fails to parse to a single STATEMENT), the implementation falls
163/// back to a whole-file [`parse`]. The whole-file path is also taken
164/// unconditionally when the `incremental` feature is disabled — that
165/// behaviour is preserved as the slow-but-always-correct A/B baseline
166/// (cy-zv0).
167///
168/// # Caveat — bench observability
169///
170/// The `bench_incremental_edit` 2k/1k ratio gate is driven by
171/// `Database::edit_file`, which today reduces this function's return
172/// value to its `syntax().to_string()` and feeds the string back into
173/// Salsa. As a result, the green-tree splice savings *inside* this
174/// function are not yet observable end-to-end at the bench. Threading
175/// the precomputed [`Parse`] into Salsa as a memo is a separate
176/// follow-up tranche; the cy-li5 acceptance criterion that the bench
177/// ratio drops below 1.5× depends on that wiring landing.
178#[must_use]
179pub fn incremental_reparse(old_tree: &SyntaxNode, edit: &TextEdit) -> Parse {
180    let old_src = old_tree.to_string();
181    let new_src = edit.apply(&old_src);
182
183    #[cfg(feature = "incremental")]
184    {
185        if let Some(parsed) = try_incremental_splice(old_tree, &old_src, edit, &new_src) {
186            return parsed;
187        }
188    }
189
190    parse(&new_src)
191}
192
193/// Smart-path attempt: splice a freshly-parsed STATEMENT sub-tree into
194/// `old_tree` and re-derive errors. Returns `None` when any bail
195/// condition trips; the caller then falls back to a whole-file reparse.
196///
197/// The function is `#[cfg(feature = "incremental")]`-gated so disabling
198/// the feature keeps the binary identical to the cy-zv0 fallback.
199#[cfg(feature = "incremental")]
200fn try_incremental_splice(
201    old_tree: &SyntaxNode,
202    old_src: &str,
203    edit: &TextEdit,
204    new_src: &str,
205) -> Option<Parse> {
206    use crate::lexer::lex;
207
208    // 1. Find the smallest enclosing STATEMENT node.
209    //
210    // `covering_element` returns the smallest element fully containing
211    // the range. We walk upward until we hit a STATEMENT (or fall off).
212    let edit_range = edit.range;
213    let stmt = covering_statement(old_tree, edit_range)?;
214    let stmt_range = stmt.text_range();
215
216    // The covering STATEMENT must strictly contain the edit range — if
217    // the edit touches the leading/trailing `;` separator that lives
218    // *outside* the STATEMENT, we'd lose the separator. (rowan's
219    // covering_element already ensures `stmt_range.contains_range(edit)`
220    // is true, but we re-assert defensively.)
221    if !stmt_range.contains_range(edit_range) {
222        return None;
223    }
224
225    // 2. Stitch the new statement text.
226    let stmt_start = usize::from(stmt_range.start());
227    let stmt_end = usize::from(stmt_range.end());
228    let edit_start = usize::from(edit_range.start()).clamp(stmt_start, stmt_end);
229    let edit_end = usize::from(edit_range.end()).clamp(edit_start, stmt_end);
230    if !old_src.is_char_boundary(stmt_start)
231        || !old_src.is_char_boundary(stmt_end)
232        || !old_src.is_char_boundary(edit_start)
233        || !old_src.is_char_boundary(edit_end)
234    {
235        return None;
236    }
237    let mut new_stmt_text = String::with_capacity(
238        (edit_start - stmt_start) + edit.replacement.len() + (stmt_end - edit_end),
239    );
240    new_stmt_text.push_str(&old_src[stmt_start..edit_start]);
241    new_stmt_text.push_str(&edit.replacement);
242    new_stmt_text.push_str(&old_src[edit_end..stmt_end]);
243
244    // 3. Boundary safety: if the lexed statement text contains a `;` or
245    //    `UNION` keyword (which are statement-count-changing tokens), the
246    //    edit may have introduced a new statement boundary. Bail.
247    let toks = lex(&new_stmt_text);
248    for t in &toks {
249        match t.kind {
250            SyntaxKind::SEMI | SyntaxKind::UNION_KW => return None,
251            _ => {}
252        }
253    }
254
255    // 4. Parse the candidate text and extract a single STATEMENT child.
256    //    The simplest robust route: full `parse` on the candidate text,
257    //    expect exactly one STATEMENT child of the SOURCE_FILE, take its
258    //    green sub-tree. If the candidate text doesn't normalise to a
259    //    single STATEMENT (e.g. empty, leading-junk recovery, multiple
260    //    statements somehow), bail.
261    let cand = parse(&new_stmt_text);
262    let cand_root = cand.syntax();
263    let mut stmt_children = cand_root
264        .children()
265        .filter(|n| n.kind() == SyntaxKind::STATEMENT);
266    let new_stmt = stmt_children.next()?;
267    if stmt_children.next().is_some() {
268        return None;
269    }
270    // The candidate STATEMENT must cover the entire candidate text —
271    // otherwise leading/trailing trivia would be lost across the splice
272    // boundary in ways the simple replace_with can't preserve.
273    if new_stmt.text_range() != cand_root.text_range() {
274        return None;
275    }
276
277    // 5. Splice. `replace_with` rebuilds the green tree along the spine
278    //    only — O(depth × siblings-per-level), not O(file).
279    let new_green_root = stmt.replace_with(new_stmt.green().into_owned());
280
281    // 6. Errors: re-parse the new source to derive a correct error set.
282    //    Note this defeats the splice savings *for the error half* of
283    //    Parse; a future tranche can incrementally reconcile errors by
284    //    keeping a sidecar map. The bench is dominated by tree work,
285    //    not error scanning, so this is the right correctness/cost
286    //    trade-off for cy-li5.
287    let full = parse(new_src);
288
289    // Sanity: the spliced tree's text MUST equal the new source. If it
290    // doesn't, our bail conditions missed something — fall back so we
291    // never violate the API's byte-equivalence invariant.
292    let spliced_root = SyntaxNode::new_root(new_green_root.clone());
293    if spliced_root.text() != new_src {
294        return None;
295    }
296
297    Some(make_parse(new_green_root, full.errors().to_vec()))
298}
299
300/// Walk upward from `covering_element(range)` until we find the smallest
301/// `STATEMENT` ancestor. Returns `None` when no such ancestor exists
302/// (e.g. the edit lies between statements or in the source-file root's
303/// trailing trivia).
304#[cfg(feature = "incremental")]
305fn covering_statement(root: &SyntaxNode, range: TextRange) -> Option<SyntaxNode> {
306    // `covering_element` panics if `range` is outside the root's text.
307    // Clamp defensively to the root's range so out-of-bounds edits go
308    // straight to the fallback rather than panicking.
309    let root_range = root.text_range();
310    if !root_range.contains_range(range) {
311        return None;
312    }
313    let elem = root.covering_element(range);
314    let start_node = match elem {
315        NodeOrToken::Node(n) => n,
316        NodeOrToken::Token(t) => t.parent()?,
317    };
318    let mut cur = Some(start_node);
319    while let Some(n) = cur {
320        if n.kind() == SyntaxKind::STATEMENT {
321            return Some(n);
322        }
323        cur = n.parent();
324    }
325    None
326}
327
328/// Construct a [`Parse`] from raw parts. Lives behind the `incremental`
329/// feature because the fallback path uses `parse(...)` directly and
330/// doesn't need the constructor.
331#[cfg(feature = "incremental")]
332fn make_parse(green: rowan::GreenNode, errors: Vec<crate::SyntaxError>) -> Parse {
333    Parse::from_parts(green, errors)
334}
335
336// ---------------------------------------------------------------------------
337// Tests
338// ---------------------------------------------------------------------------
339
340#[cfg(test)]
341mod tests {
342    use super::*;
343
344    #[test]
345    fn insert_at_middle_preserves_prefix_and_suffix() {
346        let src = "RETURN 1";
347        // Offset 6 is the boundary between "RETURN" and " 1".
348        let edit = TextEdit::insert(TextSize::from(6), "0");
349        let out = edit.apply(src);
350        assert_eq!(out, "RETURN0 1");
351    }
352
353    #[test]
354    fn replace_range() {
355        let src = "RETURN 1";
356        let range = TextRange::new(TextSize::from(7), TextSize::from(8));
357        let edit = TextEdit::replace(range, "42");
358        let out = edit.apply(src);
359        assert_eq!(out, "RETURN 42");
360    }
361
362    #[test]
363    fn delete_range() {
364        let src = "MATCH (n) RETURN n";
365        let range = TextRange::new(TextSize::from(0), TextSize::from(10));
366        let edit = TextEdit::replace(range, "");
367        let out = edit.apply(src);
368        assert_eq!(out, "RETURN n");
369    }
370
371    #[test]
372    fn out_of_range_saturates_to_end() {
373        let src = "RETURN 1";
374        let edit = TextEdit::insert(TextSize::from(999), ";");
375        let out = edit.apply(src);
376        assert_eq!(out, "RETURN 1;");
377    }
378
379    #[test]
380    fn incremental_reparse_roundtrips() {
381        let p = parse("RETURN 1");
382        let root = p.syntax();
383        let edit = TextEdit::replace(TextRange::new(TextSize::from(7), TextSize::from(8)), "42");
384        let np = incremental_reparse(&root, &edit);
385        assert_eq!(np.syntax().to_string(), "RETURN 42");
386        assert!(np.errors().is_empty(), "edit keeps the file parseable");
387    }
388
389    // ------------------------------------------------------------------
390    // cy-li5: smart-path coverage
391    // ------------------------------------------------------------------
392
393    /// Helper: assert that `incremental_reparse` produces a tree that
394    /// matches a fresh whole-file parse in both text and error set, then
395    /// return the resulting Parse so the caller can introspect further.
396    #[cfg(feature = "incremental")]
397    fn assert_equivalent_to_full(old: &SyntaxNode, edit: &TextEdit) -> Parse {
398        let new_src = edit.apply(&old.to_string());
399        let smart = incremental_reparse(old, edit);
400        let full = parse(&new_src);
401        assert_eq!(
402            smart.syntax().to_string(),
403            full.syntax().to_string(),
404            "smart-path text must equal whole-file parse text"
405        );
406        assert_eq!(
407            smart.errors().len(),
408            full.errors().len(),
409            "smart-path error count must equal whole-file ({}); errors = {:?}",
410            full.errors().len(),
411            smart
412                .errors()
413                .iter()
414                .map(|e| &e.message)
415                .collect::<Vec<_>>()
416        );
417        smart
418    }
419
420    /// Edit fully inside a single statement — smart path should hit, and
421    /// the result must agree with whole-file parse in text + error count.
422    #[test]
423    #[cfg(feature = "incremental")]
424    fn smart_path_inside_single_statement() {
425        let src = "MATCH (n) RETURN n;\nMATCH (m) RETURN m;\n";
426        let p = parse(src);
427        assert!(p.errors().is_empty(), "fixture parses clean");
428        // Replace `n` in the FIRST statement's RETURN clause (offset 17).
429        let edit = TextEdit::replace(TextRange::new(TextSize::new(17), TextSize::new(18)), "x");
430        let np = assert_equivalent_to_full(&p.syntax(), &edit);
431        assert_eq!(
432            np.syntax().to_string(),
433            "MATCH (n) RETURN x;\nMATCH (m) RETURN m;\n"
434        );
435    }
436
437    /// Edit at a clause boundary — smart path may take it (the WHERE is
438    /// inside the same STATEMENT) but in either case the result must
439    /// match a whole-file parse.
440    #[test]
441    #[cfg(feature = "incremental")]
442    fn smart_path_clause_boundary_inside_statement() {
443        let src = "MATCH (n) RETURN n;\n";
444        let p = parse(src);
445        // Insert a WHERE clause between the MATCH and the RETURN.
446        let edit = TextEdit::insert(TextSize::new(10), "WHERE n.x = 1 ");
447        let np = assert_equivalent_to_full(&p.syntax(), &edit);
448        assert_eq!(
449            np.syntax().to_string(),
450            "MATCH (n) WHERE n.x = 1 RETURN n;\n"
451        );
452    }
453
454    /// Edit that introduces a new top-level `;` — must bail to whole-file
455    /// (statement count changes). The result must still be a valid CST
456    /// with the right text.
457    #[test]
458    #[cfg(feature = "incremental")]
459    fn smart_path_bails_when_introducing_semicolon() {
460        let src = "MATCH (n) RETURN n";
461        let p = parse(src);
462        // Insert "; MATCH (m) RETURN m" before EOF. The "; " inside the
463        // statement covering element forces the bail.
464        let edit = TextEdit::insert(TextSize::new(18), "; MATCH (m) RETURN m");
465        let np = assert_equivalent_to_full(&p.syntax(), &edit);
466        assert_eq!(
467            np.syntax().to_string(),
468            "MATCH (n) RETURN n; MATCH (m) RETURN m"
469        );
470    }
471
472    /// Edit that introduces a syntax error — smart path or fallback must
473    /// produce a tree with `errors()` matching a whole-file parse, and
474    /// the tree must still be byte-lossless (spec §4.4).
475    #[test]
476    #[cfg(feature = "incremental")]
477    fn smart_path_introduces_syntax_error() {
478        let src = "MATCH (n) RETURN n;\n";
479        let p = parse(src);
480        // Replace `(n)` with `(n` — unclosed paren, syntax error.
481        let edit = TextEdit::replace(TextRange::new(TextSize::new(6), TextSize::new(9)), "(n");
482        let np = assert_equivalent_to_full(&p.syntax(), &edit);
483        assert!(!np.errors().is_empty(), "edit must produce errors");
484        assert_eq!(np.syntax().to_string(), "MATCH (n RETURN n;\n");
485    }
486
487    /// Edit that *heals* an existing syntax error must produce a clean
488    /// tree, verified against whole-file parse equivalence.
489    #[test]
490    #[cfg(feature = "incremental")]
491    fn smart_path_heals_syntax_error() {
492        let src = "MATCH (n RETURN n;\n";
493        let p = parse(src);
494        assert!(
495            !p.errors().is_empty(),
496            "fixture has the unclosed paren error"
497        );
498        // Insert the missing `)` — heal the parse.
499        let edit = TextEdit::insert(TextSize::new(8), ")");
500        let np = assert_equivalent_to_full(&p.syntax(), &edit);
501        assert_eq!(np.syntax().to_string(), "MATCH (n) RETURN n;\n");
502        assert!(np.errors().is_empty(), "heal must produce a clean tree");
503    }
504}