Skip to main content

gdscript_syntax/
prepass.rs

1//! WS2 — the indentation pre-pass (the highest-risk module).
2//!
3//! GDScript has Python-like significant indentation. This pass consumes the flat
4//! [`RawToken`] stream from the lexer and injects the synthetic, **zero-width**
5//! `Newline`/`Indent`/`Dedent` markers the parser needs to recover block structure,
6//! while leaving every original byte-carrying token (real tokens **and** trivia)
7//! exactly where it was — so the round-trip stays byte-exact.
8//!
9//! Modeled on Godot's own `gdscript_tokenizer.cpp`
10//! (`plans/PHASE-1-IMPLEMENTATION-PLAYBOOK.md` §WS2), **not** tree-sitter's
11//! `scanner.c`. Key engine-faithful choices:
12//! - **Tab width is a flat `tab_size` (default 4)**, `+1` per space — Godot adds a
13//!   flat `tab_size` per tab, not 8-column tab stops.
14//! - **Bracket suppression:** inside `()`/`[]`/`{}` newlines/indentation are not
15//!   significant (a depth counter pauses marker emission).
16//! - **`\` line continuations** are already merged into single `LineContinuation`
17//!   tokens by the lexer, so splitting logical lines on physical newlines joins them
18//!   for free.
19//! - **Blank / comment-only lines keep indentation state** (no spurious `Dedent`),
20//!   so a column-0 comment inside a body never closes the scope.
21//! - **Two distinct diagnostics** (same-line tab+space mix; cross-line deviation from
22//!   the file's first indent character) — both recover, never abort.
23//!
24//! - **Lambda bodies inside brackets** re-enable indentation. Inside `()[]{}`
25//!   indentation is normally suppressed, but a *multiline lambda* body that lives
26//!   inside an open bracket (e.g. `arr.sort_custom(func(a, b):\n\treturn a < b\n)`)
27//!   must still be a block. We mirror Godot's stack-of-stacks: a line that ends with
28//!   `:` while inside brackets opens a fresh indentation context for the lambda body,
29//!   which closes (restoring the bracket-suppressed context) once a later line dedents
30//!   back to the header's column.
31
32use text_size::{TextRange, TextSize};
33
34use crate::SyntaxKind;
35use crate::lexer::RawToken;
36
37/// Godot's default indentation width for a tab character.
38const TAB_SIZE: u32 = 4;
39
40/// A saved indentation context for a lambda body opened inside brackets. When the
41/// lambda's `:` is reached we stash the surrounding indent stack and start a fresh one
42/// based at the header line's column; the body closes once indentation returns to
43/// `base`, restoring `saved_indent_stack`.
44#[derive(Debug, Clone)]
45struct LambdaCtx {
46    saved_indent_stack: Vec<u32>,
47    base: u32,
48    /// The `bracket_depth` the lambda body lives at (the depth inside its enclosing
49    /// bracket). When a closing bracket drops below this, the body ends — even mid-line,
50    /// when the closer trails the last body statement (`call(func(): … last())`).
51    open_bracket_depth: u32,
52}
53
54/// An indentation diagnostic produced while injecting block-structure markers.
55/// Byte-ranged; mapped into a `gdscript-base` `Diagnostic` by the IDE layer.
56#[derive(Debug, Clone, PartialEq, Eq)]
57pub struct IndentDiagnostic {
58    /// The offending leading-whitespace range.
59    pub range: TextRange,
60    /// A human-readable message (mirrors Godot's wording).
61    pub message: String,
62}
63
64/// Which character a line used for its leading indentation.
65#[derive(Debug, Clone, Copy, PartialEq, Eq)]
66enum IndentChar {
67    Tab,
68    Space,
69}
70
71/// Inject `Newline`/`Indent`/`Dedent` markers into the lexer token stream.
72///
73/// Returns the augmented token stream plus any indentation diagnostics. The output is
74/// still lossless: the injected markers are zero-width, and every input token is
75/// preserved in order.
76#[must_use]
77pub fn run(tokens: &[RawToken], src: &str) -> (Vec<RawToken>, Vec<IndentDiagnostic>) {
78    let mut p = PrePass {
79        src,
80        out: Vec::with_capacity(tokens.len() + 16),
81        diags: Vec::new(),
82        indent_stack: vec![0],
83        bracket_depth: 0,
84        indent_char: None,
85        lambda_stack: Vec::new(),
86    };
87    p.run_lines(tokens);
88    (p.out, p.diags)
89}
90
91struct PrePass<'s> {
92    src: &'s str,
93    out: Vec<RawToken>,
94    diags: Vec<IndentDiagnostic>,
95    indent_stack: Vec<u32>,
96    bracket_depth: u32,
97    indent_char: Option<IndentChar>,
98    /// Active lambda-body indentation contexts (innermost last). Non-empty means
99    /// indentation is significant *despite* being inside brackets.
100    lambda_stack: Vec<LambdaCtx>,
101}
102
103impl PrePass<'_> {
104    fn run_lines(&mut self, tokens: &[RawToken]) {
105        // Split the stream into physical lines (each ends at a `NewlinePhys`, or at
106        // EOF). `\`-continuations are already absorbed into `LineContinuation` tokens,
107        // so a continued logical line is naturally one slice here.
108        let mut start = 0usize;
109        let mut i = 0usize;
110        while i < tokens.len() {
111            if tokens[i].kind == SyntaxKind::NewlinePhys {
112                self.line(&tokens[start..=i]);
113                start = i + 1;
114            }
115            i += 1;
116        }
117        if start < tokens.len() {
118            self.line(&tokens[start..]); // trailing line without a final newline
119        }
120        self.finish(src_end(self.src));
121    }
122
123    /// Process one physical line (the slice may end with a `NewlinePhys`).
124    ///
125    /// Indentation is significant when we are outside all brackets **or** inside a
126    /// lambda body opened within brackets. The logical `Newline` is emitted at the
127    /// terminator when we are at bracket depth 0, inside a lambda body, or the line is
128    /// itself a lambda header (its `:` opens a body block).
129    fn line(&mut self, line: &[RawToken]) {
130        // Blank / comment-only lines keep indentation state — copy verbatim, no
131        // markers (this is what stops a column-0 comment from closing a scope). A line
132        // whose only non-trivia content is the newline is blank too, since
133        // `NewlinePhys` is trivia.
134        let Some(first) = line.iter().find(|t| !t.kind.is_trivia()) else {
135            self.copy_verbatim(line);
136            return;
137        };
138        let col = self.column(line);
139        let at = first.range.start();
140
141        // Close any lambda bodies this line has dedented back out of.
142        self.close_lambdas(col, at);
143
144        let in_lambda = !self.lambda_stack.is_empty();
145        let suppressed = !in_lambda && self.bracket_depth > 0;
146
147        // Indentation markers only where indentation is significant.
148        if !suppressed {
149            self.diagnose_indent(line);
150            self.emit_indent_dedent(col, at);
151        }
152
153        // Copy the line's tokens, tracking brackets and the last meaningful token, and
154        // emit a logical Newline at the terminator where appropriate.
155        let mut has_terminator = false;
156        let mut last_meaningful: Option<SyntaxKind> = None;
157        for tok in line {
158            if tok.kind == SyntaxKind::NewlinePhys {
159                has_terminator = true;
160                let opens_lambda =
161                    self.bracket_depth > 0 && last_meaningful == Some(SyntaxKind::Colon);
162                if self.bracket_depth == 0 || in_lambda || opens_lambda {
163                    self.push_marker(SyntaxKind::Newline, tok.range.start());
164                }
165                self.out.push(*tok);
166            } else {
167                // A closing bracket that drops below a lambda body's enclosing depth
168                // ends that body here, even mid-line — emit its `Dedent`s before the
169                // bracket so the parser closes the block at the right place.
170                if matches!(
171                    tok.kind,
172                    SyntaxKind::RParen | SyntaxKind::RBrace | SyntaxKind::RBrack
173                ) && !self.lambda_stack.is_empty()
174                {
175                    let new_depth = self.bracket_depth.saturating_sub(1);
176                    self.close_lambdas_on_bracket(new_depth, tok.range.start());
177                }
178                self.out.push(*tok);
179                self.track_bracket(tok.kind);
180                if !tok.kind.is_trivia() {
181                    last_meaningful = Some(tok.kind);
182                }
183            }
184        }
185        // A final line with content but no trailing newline still terminates a statement.
186        if !has_terminator && (self.bracket_depth == 0 || in_lambda) {
187            self.push_marker(SyntaxKind::Newline, src_end(self.src));
188        }
189
190        // A line that ends with `:` while inside brackets is a lambda header: open a
191        // fresh indentation context for its body, based at this line's column.
192        if self.bracket_depth > 0 && last_meaningful == Some(SyntaxKind::Colon) {
193            let saved = std::mem::replace(&mut self.indent_stack, vec![col]);
194            self.lambda_stack.push(LambdaCtx {
195                saved_indent_stack: saved,
196                base: col,
197                open_bracket_depth: self.bracket_depth,
198            });
199        }
200    }
201
202    /// Close every lambda body whose base column is `>= col` (i.e. that this line has
203    /// dedented out of), emitting the `Dedent`s for its body and restoring the
204    /// surrounding indentation context.
205    fn close_lambdas(&mut self, col: u32, at: TextSize) {
206        while self.lambda_stack.last().is_some_and(|ctx| col <= ctx.base) {
207            let base = self.lambda_stack.last().expect("checked").base;
208            while *self.indent_stack.last().expect("lambda base present") > base {
209                self.indent_stack.pop();
210                self.push_marker(SyntaxKind::Dedent, at);
211            }
212            let ctx = self.lambda_stack.pop().expect("checked");
213            self.indent_stack = ctx.saved_indent_stack;
214        }
215    }
216
217    /// Close lambda bodies whose enclosing bracket has just closed — a `)`/`]`/`}` that
218    /// drops `bracket_depth` to `new_depth` *mid-line*. Mirrors [`Self::close_lambdas`]
219    /// but is keyed on bracket depth instead of column, for the case where the closer
220    /// trails the last body statement on one line (`call(func(): … last())`). The
221    /// column-based path already handles a closer that sits on its own dedented line; a
222    /// lambda is only ever popped once, so the two paths never double-close.
223    fn close_lambdas_on_bracket(&mut self, new_depth: u32, at: TextSize) {
224        while self
225            .lambda_stack
226            .last()
227            .is_some_and(|ctx| ctx.open_bracket_depth > new_depth)
228        {
229            let base = self.lambda_stack.last().expect("checked").base;
230            while *self.indent_stack.last().expect("lambda base present") > base {
231                self.indent_stack.pop();
232                self.push_marker(SyntaxKind::Dedent, at);
233            }
234            let ctx = self.lambda_stack.pop().expect("checked");
235            self.indent_stack = ctx.saved_indent_stack;
236        }
237    }
238
239    /// Copy a blank / comment-only line's tokens unchanged (no structural markers),
240    /// only updating bracket depth so an open multiline literal stays open across it.
241    fn copy_verbatim(&mut self, line: &[RawToken]) {
242        for tok in line {
243            self.out.push(*tok);
244            if tok.kind != SyntaxKind::NewlinePhys {
245                self.track_bracket(tok.kind);
246            }
247        }
248    }
249
250    /// Compare `col` to the indent stack and push `Indent` / `Dedent` markers.
251    fn emit_indent_dedent(&mut self, col: u32, at: TextSize) {
252        let top = *self.indent_stack.last().expect("indent stack has a base 0");
253        if col > top {
254            self.indent_stack.push(col);
255            self.push_marker(SyntaxKind::Indent, at);
256        } else if col < top {
257            while *self.indent_stack.last().expect("base 0 guards the loop") > col {
258                self.indent_stack.pop();
259                self.push_marker(SyntaxKind::Dedent, at);
260            }
261            if *self.indent_stack.last().expect("non-empty") != col {
262                self.diags.push(IndentDiagnostic {
263                    range: TextRange::empty(at),
264                    message: "Unindent does not match any outer indentation level.".to_owned(),
265                });
266                self.indent_stack.push(col); // resync and keep going
267            }
268        }
269    }
270
271    /// The leading-whitespace column of a line (Godot's flat `tab_size` per tab, `+1`
272    /// per space). Pure — used for the lambda-context bookkeeping before deciding
273    /// whether to diagnose.
274    fn column(&self, line: &[RawToken]) -> u32 {
275        let Some(ws) = line.first().filter(|t| t.kind == SyntaxKind::Whitespace) else {
276            return 0;
277        };
278        self.src[ws.range]
279            .bytes()
280            .fold(0u32, |col, b| col + if b == b'\t' { TAB_SIZE } else { 1 })
281    }
282
283    /// Record any tab/space indentation diagnostics for a line (same-line mix;
284    /// cross-line inconsistency with the file's first indent character).
285    fn diagnose_indent(&mut self, line: &[RawToken]) {
286        let Some(ws) = line.first().filter(|t| t.kind == SyntaxKind::Whitespace) else {
287            return;
288        };
289        let text = &self.src[ws.range];
290        let mut saw_tab = false;
291        let mut saw_space = false;
292        for b in text.bytes() {
293            saw_tab |= b == b'\t';
294            saw_space |= b == b' ';
295        }
296        if saw_tab && saw_space {
297            self.diags.push(IndentDiagnostic {
298                range: ws.range,
299                message: "Mixed use of tabs and spaces for indentation.".to_owned(),
300            });
301        } else if let Some(first) = text.bytes().next() {
302            let this = if first == b'\t' {
303                IndentChar::Tab
304            } else {
305                IndentChar::Space
306            };
307            match self.indent_char {
308                None => self.indent_char = Some(this),
309                Some(file) if file != this => {
310                    let (used, before) = match this {
311                        IndentChar::Tab => ("tab", "space"),
312                        IndentChar::Space => ("space", "tab"),
313                    };
314                    self.diags.push(IndentDiagnostic {
315                        range: ws.range,
316                        message: format!(
317                            "Used {used} character for indentation instead of {before} as used before in the file."
318                        ),
319                    });
320                }
321                Some(_) => {}
322            }
323        }
324    }
325
326    /// At end of input, close any still-open lambda bodies, then terminate any open
327    /// block by popping the indent stack to 0.
328    fn finish(&mut self, at: TextSize) {
329        self.close_lambdas(0, at); // col 0 <= every base, so all lambdas close
330        while *self.indent_stack.last().expect("base 0") > 0 {
331            self.indent_stack.pop();
332            self.push_marker(SyntaxKind::Dedent, at);
333        }
334    }
335
336    fn track_bracket(&mut self, kind: SyntaxKind) {
337        match kind {
338            SyntaxKind::LParen | SyntaxKind::LBrack | SyntaxKind::LBrace => {
339                self.bracket_depth += 1;
340            }
341            SyntaxKind::RParen | SyntaxKind::RBrack | SyntaxKind::RBrace => {
342                self.bracket_depth = self.bracket_depth.saturating_sub(1);
343            }
344            _ => {}
345        }
346    }
347
348    fn push_marker(&mut self, kind: SyntaxKind, at: TextSize) {
349        self.out.push(RawToken {
350            kind,
351            range: TextRange::empty(at),
352        });
353    }
354}
355
356/// The end-of-source offset as a `TextSize`.
357fn src_end(src: &str) -> TextSize {
358    TextSize::of(src)
359}
360
361#[cfg(test)]
362mod tests {
363    use super::*;
364    use crate::tokenize;
365
366    fn prepass(src: &str) -> Vec<RawToken> {
367        run(&tokenize(src), src).0
368    }
369
370    /// Non-trivia kind sequence — shows the synthetic markers + real tokens, hiding
371    /// whitespace/comment noise.
372    fn structure(src: &str) -> Vec<SyntaxKind> {
373        prepass(src)
374            .into_iter()
375            .filter(|t| !t.kind.is_trivia())
376            .map(|t| t.kind)
377            .collect()
378    }
379
380    fn diagnostics(src: &str) -> Vec<IndentDiagnostic> {
381        run(&tokenize(src), src).1
382    }
383
384    /// The pre-pass must remain byte-exact: zero-width markers contribute nothing and
385    /// every original token is preserved.
386    fn assert_lossless(src: &str) {
387        let rebuilt: String = prepass(src).iter().map(|t| &src[t.range]).collect();
388        assert_eq!(rebuilt, src, "prepass not lossless for {src:?}");
389    }
390
391    fn count(src: &str, kind: SyntaxKind) -> usize {
392        structure(src).into_iter().filter(|&k| k == kind).count()
393    }
394
395    #[test]
396    fn nested_func_if_drives_indent_dedent() {
397        use SyntaxKind as S;
398        let src = "func f():\n\tif x:\n\t\treturn\n";
399        assert_lossless(src);
400        assert_eq!(
401            structure(src),
402            vec![
403                S::FuncKw,
404                S::Ident,
405                S::LParen,
406                S::RParen,
407                S::Colon,
408                S::Newline,
409                S::Indent,
410                S::IfKw,
411                S::Ident,
412                S::Colon,
413                S::Newline,
414                S::Indent,
415                S::ReturnKw,
416                S::Newline,
417                S::Dedent,
418                S::Dedent,
419            ]
420        );
421    }
422
423    #[test]
424    fn line_continuation_does_not_indent() {
425        // Case 1: a `\`-continued line never produces Newline/Indent mid-expression.
426        let src = "a = 1 + \\\n  2\n";
427        assert_lossless(src);
428        assert_eq!(count(src, SyntaxKind::Indent), 0);
429        assert_eq!(count(src, SyntaxKind::Newline), 1); // exactly one logical line
430    }
431
432    #[test]
433    fn multiline_brackets_suppress_indentation() {
434        // Case 2: newlines inside [] are not significant.
435        let src = "var a = [\n\t1,\n\t2,\n]\n";
436        assert_lossless(src);
437        assert_eq!(count(src, SyntaxKind::Indent), 0);
438        assert_eq!(count(src, SyntaxKind::Dedent), 0);
439        assert_eq!(count(src, SyntaxKind::Newline), 1); // one logical statement
440    }
441
442    #[test]
443    fn top_level_lambda_body_indents() {
444        // Case 4: a multiline lambda body at statement level indents normally.
445        use SyntaxKind as S;
446        let src = "var f = func():\n\tprint()\nx = 1\n";
447        assert_lossless(src);
448        assert_eq!(count(src, S::Indent), 1);
449        assert_eq!(count(src, S::Dedent), 1);
450    }
451
452    #[test]
453    fn blank_and_comment_only_lines_keep_state() {
454        // Cases 7 & 8: blank lines and a column-0 comment must not close the block.
455        let src = "func f():\n\tx = 1\n\n# top-level comment\n\ty = 2\n";
456        assert_lossless(src);
457        // Only one Indent (into the body) and one Dedent (at EOF) — the blank and the
458        // column-0 comment do not emit a Dedent.
459        assert_eq!(count(src, SyntaxKind::Indent), 1);
460        assert_eq!(count(src, SyntaxKind::Dedent), 1);
461    }
462
463    #[test]
464    fn inline_block_has_no_indent() {
465        // Case 9: `func f(): return 1` on one line never produces an Indent.
466        let src = "func f(): return 1\n";
467        assert_lossless(src);
468        assert_eq!(count(src, SyntaxKind::Indent), 0);
469        assert_eq!(count(src, SyntaxKind::Newline), 1);
470    }
471
472    #[test]
473    fn dedent_to_eof_without_trailing_newline() {
474        // Case 11: file ends mid-nest with no trailing newline.
475        use SyntaxKind as S;
476        let src = "func f():\n\tpass";
477        assert_lossless(src);
478        let s = structure(src);
479        assert_eq!(s.last(), Some(&S::Dedent));
480        assert_eq!(count(src, S::Indent), 1);
481        assert_eq!(count(src, S::Dedent), 1);
482        // The final unterminated line still gets a logical Newline.
483        assert!(s.contains(&S::Newline));
484    }
485
486    #[test]
487    fn empty_and_comment_only_files() {
488        // Case 12.
489        assert_lossless("");
490        assert_eq!(structure(""), Vec::<SyntaxKind>::new());
491        assert_lossless("# just a comment\n");
492        assert_eq!(count("# just a comment\n", SyntaxKind::Indent), 0);
493    }
494
495    #[test]
496    fn mixed_tabs_and_spaces_diagnoses_but_recovers() {
497        // Case 6: a tab+space mix on one indentation run is flagged, not fatal.
498        let src = "func f():\n \tpass\n";
499        assert_lossless(src);
500        let diags = diagnostics(src);
501        assert!(
502            diags
503                .iter()
504                .any(|d| d.message.contains("Mixed use of tabs and spaces")),
505            "expected a mixed-indent diagnostic, got {diags:?}"
506        );
507    }
508
509    #[test]
510    fn inconsistent_indent_char_across_lines_is_flagged() {
511        // First indented line uses a tab; a later one uses spaces → file-consistency
512        // diagnostic (first char wins).
513        let src = "func f():\n\ta = 1\nfunc g():\n    b = 2\n";
514        let diags = diagnostics(src);
515        assert!(
516            diags.iter().any(|d| d.message.contains("instead of")),
517            "expected an inconsistent-indent diagnostic, got {diags:?}"
518        );
519    }
520
521    #[test]
522    fn match_block_nests() {
523        use SyntaxKind as S;
524        let src = "match x:\n\t1:\n\t\tpass\n";
525        assert_lossless(src);
526        assert_eq!(count(src, S::Indent), 2);
527        assert_eq!(count(src, S::Dedent), 2);
528        assert_eq!(structure(src)[0], S::MatchKw);
529    }
530
531    #[test]
532    fn multiline_lambda_inside_brackets_indents() {
533        // A multiline lambda body inside an open `(` re-enables indentation.
534        use SyntaxKind as S;
535        let src = "arr.sort_custom(func(a, b):\n\treturn a < b\n)\n";
536        assert_lossless(src);
537        assert_eq!(count(src, S::Indent), 1, "lambda body should Indent once");
538        assert_eq!(count(src, S::Dedent), 1, "lambda body should Dedent once");
539        // One logical statement (the call), terminated after the closing `)`.
540        let s = structure(src);
541        // The Indent comes right after the lambda's `:` + Newline.
542        let colon = s.iter().position(|&k| k == S::Colon).unwrap();
543        assert_eq!(s[colon + 1], S::Newline);
544        assert_eq!(s[colon + 2], S::Indent);
545        // The Dedent comes before the closing `)`.
546        let rparen = s.iter().rposition(|&k| k == S::RParen).unwrap();
547        assert_eq!(s[rparen - 1], S::Dedent);
548    }
549
550    #[test]
551    fn lambda_inside_multiline_array() {
552        // A lambda living inside a multiline `[ ]` literal.
553        use SyntaxKind as S;
554        let src = "var a = [\n\tfunc():\n\t\tprint()\n]\n";
555        assert_lossless(src);
556        assert_eq!(count(src, S::Indent), 1);
557        assert_eq!(count(src, S::Dedent), 1);
558    }
559
560    #[test]
561    fn nested_lambdas_inside_brackets() {
562        use SyntaxKind as S;
563        let src = "outer(func():\n\tinner(func():\n\t\tbody\n\t)\n)\n";
564        assert_lossless(src);
565        assert_eq!(count(src, S::Indent), 2, "two nested lambda bodies");
566        assert_eq!(count(src, S::Dedent), 2);
567    }
568
569    #[test]
570    fn single_line_lambda_inside_brackets_has_no_indent() {
571        // The body is on the header line → no Indent/Dedent, one statement.
572        use SyntaxKind as S;
573        let src = "arr.map(func(x): x * 2)\n";
574        assert_lossless(src);
575        assert_eq!(count(src, S::Indent), 0);
576        assert_eq!(count(src, S::Dedent), 0);
577        assert_eq!(count(src, S::Newline), 1);
578    }
579}