Skip to main content

fresh/primitives/
indent_rules.rs

1//! Per-language, regex-based auto-indentation (pure Rust, WASM-safe).
2//!
3//! This is the per-language indentation tier described in
4//! `docs/internal/indentation-rules-design.md`. It sits between the
5//! tree-sitter AST tier ([`crate::primitives::indent`]) and the universal
6//! bracket heuristic ([`crate::primitives::indent_pattern`]).
7//!
8//! # What it does
9//!
10//! Each language is described by a small set of anchored regexes:
11//!
12//! - **increase** — if the *reference* line matches, the new line goes one
13//!   level deeper (e.g. a line ending with `{`, or a Ruby `def`).
14//! - **decrease** — if the *new* line's leading content matches, it drops one
15//!   level (e.g. a line starting with `}`, or a Ruby `end`).
16//! - **indent_next_line** — one-shot +1 for the immediately following line
17//!   only (braceless `if (x)`).
18//! - **dedent_next_line** — one-shot −1 (Python flow-exit `return`/`pass`/…,
19//!   Fresh's existing `@dedent_after`, issue #2192).
20//! - **self_close** — suppresses *increase* when the same line also closes the
21//!   block it opened (`def f; end`, `if x then y end`). This lets one-liners
22//!   avoid over-indenting without needing regex look-ahead (which the `regex`
23//!   crate does not support).
24//!
25//! Patterns use the [`regex`](https://docs.rs/regex) crate's syntax (linear,
26//! no look-around or back-references). They are matched against each line's
27//! *code view* — the line with comment and string spans blanked to spaces — so
28//! a bracket or keyword inside a string/comment never triggers indentation.
29//!
30//! # Avoiding glitches: scope masking
31//!
32//! The classic failure of regex indentation is triggering on a brace inside a
33//! string or a keyword inside a comment. Before matching, every line is turned
34//! into a **code view**: bytes that the caller reports as comment/string are
35//! replaced with spaces. The caller sources that judgement from the syntax
36//! highlighter's *already-computed* render spans
37//! ([`crate::primitives::highlight_engine::HighlightEngine::category_at_position`]),
38//! so there is no second parse — we reuse the work rendering already did. When
39//! no scope information is available (line outside the render cache, or a plain
40//! buffer) the code view is the raw line, which degrades to plain regex
41//! matching rather than misbehaving.
42//!
43//! # Cost
44//!
45//! Per Enter: one backward scan for the reference line plus 2–4 single-line
46//! regex matches on short masked strings. No parsing, no tree. Rule sets are
47//! compiled once (lazily) and shared across all languages in a family.
48
49use crate::model::buffer::Buffer;
50use once_cell::sync::Lazy;
51use regex::Regex;
52use std::collections::HashMap;
53use std::sync::{Arc, RwLock};
54
/// Indentation family shared by a group of languages.
///
/// Languages are assigned to a family by [`family_for_id`]; each family has
/// exactly one built-in rule set, so supporting a new language that behaves
/// like an existing one only needs a new row in that table.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Family {
    /// Languages whose blocks are delimited by `{ } [ ] ( )`: C, C++, C#,
    /// Java, Rust, Go, JS, TS, PHP, Swift, Kotlin, Dart, CSS, SCSS, JSON, …
    CurlyBrace,
    /// Python: a trailing `:` opens a block and flow-exit statements dedent
    /// the following line.
    Python,
    /// Ruby: `def…end`, `do…end`, with midblock `else`/`when`/`rescue`.
    RubyLike,
    /// Lua: `function…end`, `if…then…end`, `for…do…end`, `repeat…until`.
    LuaLike,
    /// Bash: `if…then…fi`, `for/while…do…done`, `case…esac`, `{ }`.
    BashLike,
    /// Fish: `if…end`, `for…end`, `function…end`, `switch/case…end`.
    FishLike,
    /// Pascal: `begin…end`, `case…of…end`, `repeat…until`.
    PascalLike,
    /// Smali: dot-directive blocks (`.method` … `.end method`) plus
    /// brace-delimited register/value lists.
    SmaliLike,
}
78
/// Uncompiled rule set: the regex source strings a family provides.
///
/// All pattern fields are optional. A `None` pattern never matches, so a
/// default-constructed definition produces no indentation changes at all.
#[derive(Debug, Clone, Default)]
pub struct IndentRulesDef {
    /// Matched against the reference line; a hit indents the new line.
    pub increase: Option<&'static str>,
    /// Matched against the text that follows the cursor; a hit dedents it.
    pub decrease: Option<&'static str>,
    /// Matched against the reference line; a hit indents the new line.
    pub indent_next_line: Option<&'static str>,
    /// Matched against the reference line; a hit dedents the new line.
    pub dedent_next_line: Option<&'static str>,
    /// Cancels `increase` when it also matches the reference line.
    pub self_close: Option<&'static str>,
    /// Set for languages where indentation *is* the block structure (Python,
    /// YAML, …). When set, pressing Enter on a blank line keeps the cursor's
    /// current column rather than re-deriving the indent from an earlier
    /// line, so a manual dedent sticks. Brace/keyword languages leave it
    /// `false` because their structure already determines the indent.
    pub indentation_significant: bool,
}
96
97/// Compiled, cached form of [`IndentRulesDef`].
98pub struct IndentRules {
99    increase: Option<Regex>,
100    decrease: Option<Regex>,
101    indent_next_line: Option<Regex>,
102    dedent_next_line: Option<Regex>,
103    self_close: Option<Regex>,
104    indentation_significant: bool,
105}
106
107impl IndentRules {
108    fn compile(def: &IndentRulesDef) -> Self {
109        Self::compile_parts(
110            def.increase,
111            def.decrease,
112            def.indent_next_line,
113            def.dedent_next_line,
114            def.self_close,
115            def.indentation_significant,
116        )
117    }
118
119    /// Compile from individual pattern strings of any lifetime. A pattern that
120    /// fails to compile is dropped (treated as "never matches") rather than
121    /// panicking — for built-in rules a bad pattern is a programmer error; for
122    /// user config it keeps one typo'd rule from taking the editor down.
123    fn compile_parts(
124        increase: Option<&str>,
125        decrease: Option<&str>,
126        indent_next_line: Option<&str>,
127        dedent_next_line: Option<&str>,
128        self_close: Option<&str>,
129        indentation_significant: bool,
130    ) -> Self {
131        let c = |p: Option<&str>| p.and_then(|s| Regex::new(s).ok());
132        Self {
133            indentation_significant,
134            increase: c(increase),
135            decrease: c(decrease),
136            indent_next_line: c(indent_next_line),
137            dedent_next_line: c(dedent_next_line),
138            self_close: c(self_close),
139        }
140    }
141
142    /// Indent (in visual columns) for a new line inserted at `position`.
143    ///
144    /// `is_code(byte)` returns `false` for bytes inside a comment or string;
145    /// see the module docs. Pass `|_| true` to disable masking.
146    pub fn calculate_indent<F: Fn(usize) -> bool>(
147        &self,
148        buffer: &Buffer,
149        position: usize,
150        tab_size: usize,
151        is_code: F,
152    ) -> usize {
153        let unit = tab_size.max(1);
154
155        let cur = line_bounds(buffer, position);
156        let cur_has_content = first_nonws(buffer, cur.start, position).is_some();
157
158        // Indentation-significant languages only (Python, …): cursor on a
159        // whitespace-only stretch with nothing after it on the line (a blank
160        // line, or trailing whitespace) preserves the cursor's current column.
161        // Pressing Enter must keep a manual dedent — once the user has stepped
162        // out from under an earlier block, re-deriving the indent from that
163        // block (pulling them back in) is wrong, and in a layout-defined
164        // language only the user can say which block the next line belongs to.
165        // Brace/keyword languages skip this: their structure is unambiguous, so
166        // the normal derivation below is correct. A closing delimiter *after*
167        // the cursor (`    │}`) is handled by the normal path regardless.
168        if self.indentation_significant
169            && !cur_has_content
170            && first_nonws(buffer, position, cur.end).is_none()
171        {
172            return visual_indent(buffer, cur.start, position, tab_size);
173        }
174
175        // Reference line: the current line's content above the split if it has
176        // any, else the nearest previous non-blank line. Mirrors the structure
177        // of `indent_pattern::calculate_indent_pattern`.
178        let reference = if cur_has_content {
179            Some(LineSpan {
180                start: cur.start,
181                end: position,
182            })
183        } else {
184            prev_nonblank_line(buffer, cur.start)
185        };
186
187        let Some(reference) = reference else {
188            return 0;
189        };
190        let base = visual_indent(buffer, reference.start, reference.end, tab_size);
191        let ref_code = code_view(buffer, reference.start, reference.end, &is_code);
192
193        let mut indent = base;
194        let opened = self.increases(&ref_code) || matches(&self.indent_next_line, &ref_code);
195        if opened {
196            indent += unit;
197        } else if matches(&self.dedent_next_line, &ref_code) {
198            indent = indent.saturating_sub(unit);
199        }
200
201        // The new line's tail (text that moves down past the cursor). A leading
202        // `}` / `end` here dedents the line being created — UNLESS the opener we
203        // just counted is on this same line (the `{│}` case: cursor between a
204        // freshly typed/auto-closed pair). There the closer is relocated to its
205        // own line by the editor's bracket-expansion, so it must not cancel the
206        // cursor line's one-level indent. Only a closer paired with an opener on
207        // a *previous* reference line genuinely dedents (e.g. `{\n    │}`).
208        let tail = code_view(buffer, position, cur.end, &is_code);
209        let opener_on_current_line = opened && cur_has_content;
210        if matches(&self.decrease, &tail) && !opener_on_current_line {
211            indent = indent.saturating_sub(unit);
212        }
213
214        indent
215    }
216
217    /// Indent for a line whose first typed character is the closing delimiter
218    /// `ch` (`}`, `]`, `)`). Returns `None` when this language has no decrease
219    /// rule (so the caller can fall back).
220    pub fn calculate_dedent_for_delimiter<F: Fn(usize) -> bool>(
221        &self,
222        buffer: &Buffer,
223        position: usize,
224        ch: char,
225        tab_size: usize,
226        is_code: F,
227    ) -> Option<usize> {
228        let probe = format!("{ch}");
229        if !matches(&self.decrease, &probe) {
230            return None;
231        }
232        let unit = tab_size.max(1);
233        let cur = line_bounds(buffer, position);
234        let reference = prev_nonblank_line(buffer, cur.start)?;
235        let base = visual_indent(buffer, reference.start, reference.end, tab_size);
236        let ref_code = code_view(buffer, reference.start, reference.end, &is_code);
237
238        let mut indent = base;
239        if self.increases(&ref_code) {
240            indent += unit;
241        }
242        // The closer dedents one level back to its opener.
243        Some(indent.saturating_sub(unit))
244    }
245
246    /// `increase` matches and the line does not also self-close.
247    fn increases(&self, code: &str) -> bool {
248        matches(&self.increase, code) && !matches(&self.self_close, code)
249    }
250}
251
252fn matches(re: &Option<Regex>, text: &str) -> bool {
253    re.as_ref().is_some_and(|r| r.is_match(text))
254}
255
256/// Look up the effective rules for a language id (e.g. `"rust"`, `"ruby"`).
257///
258/// A user override registered via [`set_user_rule`] (from a
259/// `[languages.<id>.indent]` config block) takes precedence over the built-in
260/// family. Returns `None` when neither exists, so the caller falls back to the
261/// generic bracket heuristic.
262pub fn rules_for_id(id: &str) -> Option<Arc<IndentRules>> {
263    if let Some(rules) = user_rule_for_id(id) {
264        return Some(rules);
265    }
266    let family = family_for_id(id)?;
267    FAMILY_RULES.get(&family).cloned()
268}
269
270/// Look up *only* a user override registered via [`set_user_rule`] for `id`,
271/// ignoring the built-in family fallback.
272///
273/// This is what connects a `[languages.<id>.indent]` block to the editor: a
274/// config-only language (no syntect grammar / no tree-sitter) is keyed by its
275/// config id, which the syntax-name-based lookup can never resolve. Callers that
276/// want *only* the user's explicit rules — and must otherwise defer to the
277/// existing tiering (e.g. the C-style bracket scanner for curly-brace
278/// languages) — use this rather than [`rules_for_id`], whose family fallback
279/// would shadow that tiering.
280pub fn user_rule_for_id(id: &str) -> Option<Arc<IndentRules>> {
281    USER_RULES.read().unwrap().get(id).cloned()
282}
283
284/// Look up rules from a syntect display name (e.g. `"C++"`, `"C#"`,
285/// `"Kotlin"`). Normalizes the common verbose/aliased names then defers to
286/// [`rules_for_id`]. Used by the no-tree-sitter indent path, which only has a
287/// syntect syntax name to go on.
288pub fn rules_for_syntax_name(name: &str) -> Option<Arc<IndentRules>> {
289    let lower = name.to_ascii_lowercase();
290    let id = match lower.as_str() {
291        "c++" => "cpp",
292        "c#" => "csharp",
293        n if n.contains("typescript") => "typescript",
294        n if n.contains("javascript") => "javascript",
295        // syntect ships bash as "Bourne Again Shell (bash)".
296        n if n.contains("bash") || n.contains("shell") => "bash",
297        other => other,
298    };
299    rules_for_id(id)
300}
301
302/// Map a normalized language id to its family. This is the extension point:
303/// adding a language is usually one arm here.
304fn family_for_id(id: &str) -> Option<Family> {
305    let f = match id {
306        "rust" | "c" | "cpp" | "c++" | "csharp" | "c_sharp" | "java" | "go" | "javascript"
307        | "typescript" | "typescriptreact" | "javascriptreact" | "php" | "swift" | "kotlin"
308        | "dart" | "scala" | "json" | "jsonc" | "css" | "scss" | "less" => Family::CurlyBrace,
309        "python" => Family::Python,
310        "ruby" => Family::RubyLike,
311        "lua" => Family::LuaLike,
312        "bash" | "sh" | "shell" | "shellscript" => Family::BashLike,
313        "fish" => Family::FishLike,
314        "pascal" => Family::PascalLike,
315        "smali" => Family::SmaliLike,
316        _ => return None,
317    };
318    Some(f)
319}
320
321/// The built-in `IndentRulesDef` for a family. Used both to build
322/// [`FAMILY_RULES`] and as the base a user override is layered onto.
323fn def_for_family(family: Family) -> &'static IndentRulesDef {
324    match family {
325        Family::CurlyBrace => &CURLY_BRACE,
326        Family::Python => &PYTHON,
327        Family::RubyLike => &RUBY_LIKE,
328        Family::LuaLike => &LUA_LIKE,
329        Family::BashLike => &BASH_LIKE,
330        Family::FishLike => &FISH_LIKE,
331        Family::PascalLike => &PASCAL_LIKE,
332        Family::SmaliLike => &SMALI_LIKE,
333    }
334}
335
336/// Compiled rules per family, built once on first use.
337static FAMILY_RULES: Lazy<HashMap<Family, Arc<IndentRules>>> = Lazy::new(|| {
338    let mut m = HashMap::new();
339    for family in [
340        Family::CurlyBrace,
341        Family::Python,
342        Family::RubyLike,
343        Family::LuaLike,
344        Family::BashLike,
345        Family::FishLike,
346        Family::PascalLike,
347        Family::SmaliLike,
348    ] {
349        m.insert(
350            family,
351            Arc::new(IndentRules::compile(def_for_family(family))),
352        );
353    }
354    m
355});
356
357/// User-supplied indentation rules from `[languages.<id>.indent]`, keyed by
358/// language id. Checked before [`FAMILY_RULES`] in [`rules_for_id`]. Rebuilt by
359/// [`clear_user_rules`] + [`set_user_rule`] whenever config is (re)loaded.
360static USER_RULES: Lazy<RwLock<HashMap<String, Arc<IndentRules>>>> =
361    Lazy::new(|| RwLock::new(HashMap::new()));
362
363/// Drop all user overrides. Call before re-applying config so removed blocks
364/// stop taking effect.
365pub fn clear_user_rules() {
366    USER_RULES.write().unwrap().clear();
367}
368
369/// Register a user override for `id` (e.g. from `[languages.rust.indent]`).
370///
371/// Any pattern left `None` inherits from the language's built-in family (so a
372/// config that sets only `increase_indent_pattern` keeps the family's
373/// `decrease`/`self_close`); a language with no family starts from blank rules,
374/// which is how config can add indentation for an otherwise-unknown language.
375/// Patterns are regexes evaluated against the line's code view (comment/string
376/// spans masked out); see the module docs.
377pub fn set_user_rule(
378    id: &str,
379    increase: Option<&str>,
380    decrease: Option<&str>,
381    indent_next_line: Option<&str>,
382    dedent_next_line: Option<&str>,
383    self_close: Option<&str>,
384) {
385    // Inherit each unset pattern (and the indentation-significant flag) from the
386    // built-in family, if any.
387    let base = family_for_id(id).map(def_for_family);
388    let rules = IndentRules::compile_parts(
389        increase.or(base.and_then(|d| d.increase)),
390        decrease.or(base.and_then(|d| d.decrease)),
391        indent_next_line.or(base.and_then(|d| d.indent_next_line)),
392        dedent_next_line.or(base.and_then(|d| d.dedent_next_line)),
393        self_close.or(base.and_then(|d| d.self_close)),
394        base.map(|d| d.indentation_significant).unwrap_or(false),
395    );
396    USER_RULES
397        .write()
398        .unwrap()
399        .insert(id.to_string(), Arc::new(rules));
400}
401
402const CURLY_BRACE: IndentRulesDef = IndentRulesDef {
403    // Line ends opening a block/group. Trailing whitespace (and masked
404    // comments) are eaten by `\s*$`.
405    increase: Some(r"[\{\[\(]\s*$"),
406    // Line begins by closing one.
407    decrease: Some(r"^\s*[\}\]\)]"),
408    // Braceless control head: `if (..)`, `for (..)`, `while (..)`, or `else`.
409    indent_next_line: Some(r"^\s*((if|for|while)\b.*\)|else)\s*$"),
410    dedent_next_line: None,
411    self_close: None,
412    indentation_significant: false,
413};
414
415const SMALI_LIKE: IndentRulesDef = IndentRulesDef {
416    increase: Some(
417        r"(?x)
418        (^\s*\.
419            (?:method|annotation|subannotation|packed-switch|sparse-switch|array-data|param|parameter)
420            \b
421        )
422        |
423        ([\{\[\(]\s*$)
424        ",
425    ),
426    decrease: Some(
427        r"(?x)^\s*
428        (?:
429            \.end\s+
430                (?:method|annotation|subannotation|packed-switch|sparse-switch|array-data|param|parameter)\b
431            |
432            [\}\]\)]
433        )
434        ",
435    ),
436    indent_next_line: None,
437    dedent_next_line: None,
438    self_close: Some(
439        r"(?x)^\s*\.
440            (?:method|annotation|subannotation|packed-switch|sparse-switch|array-data|param|parameter)
441            \b.*\s\.end\s+
442            (?:method|annotation|subannotation|packed-switch|sparse-switch|array-data|param|parameter)\b
443        ",
444    ),
445    indentation_significant: false,
446};
447
448const PYTHON: IndentRulesDef = IndentRulesDef {
449    increase: Some(r":\s*$"),
450    // Best-effort: a moved-down midblock keyword dedents to its header.
451    decrease: Some(r"^\s*(elif|else|except|finally|case)\b"),
452    indent_next_line: None,
453    dedent_next_line: Some(r"^\s*(return|pass|raise|break|continue)\b"),
454    self_close: None,
455    indentation_significant: true,
456};
457
458const RUBY_LIKE: IndentRulesDef = IndentRulesDef {
459    // Block-opening keywords at line start, OR a trailing `do`/`do |x|`.
460    increase: Some(
461        r"(^\s*(if|unless|while|until|for|begin|def|class|module|case|else|elsif|when|in|rescue|ensure)\b)|(\bdo(\s*\|[^|]*\|)?\s*$)",
462    ),
463    // `end` and midblock keywords dedent their own line.
464    decrease: Some(r"^\s*(end|else|elsif|when|in|rescue|ensure)\b"),
465    indent_next_line: None,
466    dedent_next_line: None,
467    // Suppress increase for one-liners like `def f; end` / `if x then y end`.
468    self_close: Some(r"\bend\b"),
469    indentation_significant: false,
470};
471
472const LUA_LIKE: IndentRulesDef = IndentRulesDef {
473    increase: Some(
474        r"(^\s*((local\s+)?function|if|elseif|else|for|while|repeat)\b)|(\b(do|then)\s*$)",
475    ),
476    decrease: Some(r"^\s*(end|else|elseif|until)\b"),
477    indent_next_line: None,
478    dedent_next_line: None,
479    self_close: Some(r"\bend\b"),
480    indentation_significant: false,
481};
482
483const BASH_LIKE: IndentRulesDef = IndentRulesDef {
484    // `then`/`do` line ends, `case … in`, or a function body's opening `{`.
485    // Note: `(` is deliberately excluded — in Bash `$(...)` / `(...)` is a
486    // subshell/command-substitution, not an indented block, so a trailing `(`
487    // must not deepen indent (mirrors the grammar's indents.scm).
488    increase: Some(r"(\b(then|do)\s*$)|(^\s*case\b.*\bin\s*$)|(\{\s*$)"),
489    decrease: Some(r"^\s*(fi|done|esac|else|elif|\})"),
490    indent_next_line: None,
491    dedent_next_line: None,
492    self_close: None,
493    indentation_significant: false,
494};
495
496const FISH_LIKE: IndentRulesDef = IndentRulesDef {
497    increase: Some(r"^\s*(if|else|for|while|begin|function|switch|case)\b"),
498    decrease: Some(r"^\s*(end|else|case)\b"),
499    indent_next_line: None,
500    dedent_next_line: None,
501    self_close: Some(r"\bend\b"),
502    indentation_significant: false,
503};
504
505const PASCAL_LIKE: IndentRulesDef = IndentRulesDef {
506    increase: Some(r"(^\s*(begin|case|record|try|repeat|asm)\b)|(\b(begin|of)\s*$)"),
507    decrease: Some(r"^\s*(end|until|except|finally)\b"),
508    indent_next_line: None,
509    dedent_next_line: None,
510    self_close: Some(r"\bend\b"),
511    indentation_significant: false,
512};
513
514// ---------------------------------------------------------------------------
515// Line geometry helpers (byte-oriented, tab-aware). Kept local so the module
516// has no dependency on the tree-sitter `indent` module.
517// ---------------------------------------------------------------------------
518
/// Half-open byte range `[start, end)` of (part of) a line in the buffer.
#[derive(Clone, Copy)]
struct LineSpan {
    /// Offset of the first byte of the span.
    start: usize,
    /// Offset one past the last byte of the span.
    end: usize,
}
524
525fn byte_at(buffer: &Buffer, pos: usize) -> Option<u8> {
526    if pos >= buffer.len() {
527        return None;
528    }
529    buffer.slice_bytes(pos..pos + 1).first().copied()
530}
531
532/// Bounds of the line containing `position`: `start` is just after the
533/// preceding `\n` (or 0); `end` is the next `\n` or buffer end.
534fn line_bounds(buffer: &Buffer, position: usize) -> LineSpan {
535    let mut start = position;
536    while start > 0 && byte_at(buffer, start - 1) != Some(b'\n') {
537        start -= 1;
538    }
539    let mut end = position;
540    while end < buffer.len() && byte_at(buffer, end) != Some(b'\n') {
541        end += 1;
542    }
543    LineSpan { start, end }
544}
545
546/// First non-whitespace byte position in `[start, end)`, if any.
547fn first_nonws(buffer: &Buffer, start: usize, end: usize) -> Option<usize> {
548    let mut p = start;
549    while p < end {
550        match byte_at(buffer, p) {
551            Some(b' ') | Some(b'\t') | Some(b'\r') => p += 1,
552            Some(_) => return Some(p),
553            None => return None,
554        }
555    }
556    None
557}
558
559/// Nearest non-blank line strictly above the line starting at `line_start`.
560fn prev_nonblank_line(buffer: &Buffer, line_start: usize) -> Option<LineSpan> {
561    if line_start == 0 {
562        return None;
563    }
564    let mut pos = line_start - 1; // the '\n' ending the previous line
565    loop {
566        let span = line_bounds(buffer, pos);
567        if first_nonws(buffer, span.start, span.end).is_some() {
568            return Some(span);
569        }
570        if span.start == 0 {
571            return None;
572        }
573        pos = span.start - 1;
574    }
575}
576
577/// Visual indent width of `[start, end)` (tabs expand to `tab_size`).
578fn visual_indent(buffer: &Buffer, start: usize, end: usize, tab_size: usize) -> usize {
579    let mut indent = 0;
580    let mut p = start;
581    while p < end {
582        match byte_at(buffer, p) {
583            Some(b' ') => indent += 1,
584            Some(b'\t') => indent += tab_size,
585            _ => break,
586        }
587        p += 1;
588    }
589    indent
590}
591
592/// The line `[start, end)` as a string with comment/string bytes (per
593/// `is_code`) blanked to spaces, and `\r` dropped. See module docs.
594fn code_view<F: Fn(usize) -> bool>(
595    buffer: &Buffer,
596    start: usize,
597    end: usize,
598    is_code: &F,
599) -> String {
600    let bytes = buffer.slice_bytes(start..end);
601    let mut out = String::with_capacity(bytes.len());
602    for (i, &b) in bytes.iter().enumerate() {
603        if b == b'\r' || b == b'\n' {
604            continue;
605        }
606        // Non-ASCII bytes inside identifiers/strings: keep as-is only when code.
607        if is_code(start + i) {
608            out.push(b as char);
609        } else {
610            out.push(' ');
611        }
612    }
613    out
614}
615
616#[cfg(test)]
617mod tests {
618    use super::*;
619    use crate::model::filesystem::NoopFileSystem;
620    use std::sync::Arc;
621
622    fn buf(content: &str) -> Buffer {
623        let fs = Arc::new(NoopFileSystem);
624        let mut b = Buffer::empty(fs);
625        b.insert(0, content);
626        b
627    }
628
629    /// Indent at end of buffer, no scope masking.
630    fn indent(id: &str, content: &str, tab: usize) -> usize {
631        rules_for_id(id)
632            .unwrap()
633            .calculate_indent(&buf(content), content.len(), tab, |_| true)
634    }
635
636    /// Indent at end of buffer, masking the given byte ranges as non-code
637    /// (i.e. inside a string/comment).
638    fn indent_masked(id: &str, content: &str, tab: usize, masked: &[(usize, usize)]) -> usize {
639        let b = buf(content);
640        let is_code = |byte: usize| !masked.iter().any(|&(s, e)| byte >= s && byte < e);
641        rules_for_id(id)
642            .unwrap()
643            .calculate_indent(&b, content.len(), tab, is_code)
644    }
645
646    // ---- CurlyBrace -------------------------------------------------------
647
648    #[test]
649    fn curly_indents_after_open_brace() {
650        assert_eq!(indent("rust", "fn main() {\n", 4), 4);
651        assert_eq!(indent("typescript", "function f() {\n", 4), 4);
652    }
653
654    #[test]
655    fn curly_no_indent_after_balanced_line() {
656        assert_eq!(indent("rust", "let x = 1;\n", 4), 0);
657        // One-liner body: ends with `}`, must not indent.
658        assert_eq!(indent("rust", "fn x() { return 1; }\n", 4), 0);
659    }
660
661    #[test]
662    fn curly_dedents_before_close_brace() {
663        // Press enter inside `{│}` style: the tail `}` dedents.
664        let content = "fn main() {\n    }";
665        let pos = content.len() - 1; // just before `}`
666        let b = buf(content);
667        let got = rules_for_id("rust")
668            .unwrap()
669            .calculate_indent(&b, pos, 4, |_| true);
670        assert_eq!(got, 0);
671    }
672
673    #[test]
674    fn curly_braceless_if_indents_next_line_only() {
675        assert_eq!(indent("c", "if (x)\n", 4), 4);
676    }
677
678    #[test]
679    fn curly_dedent_for_typed_brace() {
680        let content = "fn main() {\n    body\n";
681        let dedent = rules_for_id("rust")
682            .unwrap()
683            .calculate_dedent_for_delimiter(&buf(content), content.len(), '}', 4, |_| true);
684        assert_eq!(dedent, Some(0));
685    }
686
687    // ---- Anti-glitch corpus (the headline cases) --------------------------
688
689    #[test]
690    fn no_indent_for_brace_in_string() {
691        // `let x = "{";` — the `{` is inside a string literal.
692        let content = "let x = \"{\";\n";
693        let open = content.find('{').unwrap();
694        // Mask the string contents (and quotes) so the `{` is not code.
695        let masked = [(content.find('"').unwrap(), open + 2)];
696        assert_eq!(indent_masked("rust", content, 4, &masked), 0);
697        // Sanity: without masking the naive matcher would wrongly indent.
698        assert_eq!(indent("rust", content, 4), 0); // still 0 here: `;` ends line
699    }
700
701    #[test]
702    fn no_indent_for_trailing_brace_in_comment() {
703        // `foo() // {` — trailing `{` lives in a line comment.
704        let content = "foo() // {\n";
705        let cstart = content.find("//").unwrap();
706        let masked = [(cstart, content.len())];
707        assert_eq!(indent_masked("rust", content, 4, &masked), 0);
708    }
709
710    #[test]
711    fn brace_in_comment_does_not_defeat_real_open() {
712        // `if (x) { // start {` → real `{` plus a decoy in the comment.
713        let content = "if (x) { // start {\n";
714        let cstart = content.find("//").unwrap();
715        let masked = [(cstart, content.len())];
716        // Masked view ends with the real `{` then spaces → one level.
717        assert_eq!(indent_masked("rust", content, 4, &masked), 4);
718    }
719
720    // ---- Python -----------------------------------------------------------
721    // Indent is taken at the end of the content (cursor on the line being split),
722    // mirroring an Enter pressed at end-of-line in the editor.
723
724    #[test]
725    fn python_indents_after_colon() {
726        assert_eq!(indent("python", "def foo():", 4), 4);
727        assert_eq!(indent("python", "if x:", 4), 4);
728    }
729
730    #[test]
731    fn python_dedents_after_return() {
732        let content = "def foo():\n    return 1";
733        assert_eq!(indent("python", content, 4), 0);
734    }
735
736    #[test]
737    fn python_keeps_indent_inside_body() {
738        let content = "def foo():\n    x = 1";
739        assert_eq!(indent("python", content, 4), 4);
740    }
741
742    #[test]
743    fn python_blank_line_keeps_manual_dedent() {
744        // After an `if x:` block the user backspaces the auto-indent to column 0
745        // on the blank line, then presses Enter: the new line must stay at 0,
746        // not be pulled back under the block. (Indentation-significant: only the
747        // user can say which block the next line belongs to.)
748        let content = "if x:\n    foo()\n"; // cursor on the trailing blank line, col 0
749        assert_eq!(indent("python", content, 4), 0);
750    }
751
752    #[test]
753    fn python_blank_line_maintains_current_column() {
754        // On a blank line whose whitespace the user left at the body column,
755        // Enter keeps that column (does not collapse).
756        let content = "if x:\n    foo()\n    "; // trailing 4 spaces, cursor at col 4
757        assert_eq!(indent("python", content, 4), 4);
758    }
759
760    #[test]
761    fn curly_blank_line_rederives_not_preserved() {
762        // Contrast: brace languages are NOT indentation-significant, so a blank
763        // line re-derives from the structure (here: still inside `{`), rather
764        // than preserving a column. `fn f() {` + blank line → one level in.
765        assert_eq!(indent("rust", "fn f() {\n", 4), 4);
766    }
767
768    #[test]
769    fn python_colon_in_string_does_not_indent() {
770        // `x = {"a": 1}` ends with `}` not `:`, but check a dict-literal colon
771        // inside a string is ignored: `s = "key:"`.
772        let content = "s = \"key:\"";
773        let q1 = content.find('"').unwrap();
774        let q2 = content.rfind('"').unwrap();
775        let masked = [(q1, q2 + 1)];
776        assert_eq!(indent_masked("python", content, 4, &masked), 0);
777    }
778
779    // ---- RubyLike ---------------------------------------------------------
780
781    #[test]
782    fn ruby_indents_after_def_and_do() {
783        assert_eq!(indent("ruby", "def foo\n", 2), 2);
784        assert_eq!(indent("ruby", "[1,2].each do |n|\n", 2), 2);
785    }
786
787    #[test]
788    fn ruby_one_liner_with_end_does_not_indent() {
789        assert_eq!(indent("ruby", "def foo; end\n", 2), 0);
790        assert_eq!(indent("ruby", "if x then y end\n", 2), 0);
791    }
792
793    #[test]
794    fn ruby_end_in_string_does_not_dedent_or_break() {
795        // `s = "end"` must not be treated as a block keyword.
796        let content = "x = 1\ns = \"end\"\n";
797        let q1 = content.rfind('"').unwrap();
798        // mask the whole quoted "end"
799        let qs = content[..q1].rfind('"').unwrap();
800        let masked = [(qs, q1 + 1)];
801        // reference line `s = "end"` → masked `s =      ` → no opener, indent 0.
802        assert_eq!(indent_masked("ruby", content, 2, &masked), 0);
803    }
804
805    #[test]
806    fn ruby_midblock_else_reindents_body() {
807        // After an `else` line, the body indents one level from the else.
808        let content = "if x\n  a\nelse\n";
809        assert_eq!(indent("ruby", content, 2), 2);
810    }
811
812    // ---- LuaLike ----------------------------------------------------------
813
814    #[test]
815    fn lua_indents_after_block_openers() {
816        assert_eq!(indent("lua", "function f()\n", 4), 4);
817        assert_eq!(indent("lua", "if x then\n", 4), 4);
818        assert_eq!(indent("lua", "for i = 1, n do\n", 4), 4);
819    }
820
821    #[test]
822    fn lua_one_liner_with_end_does_not_indent() {
823        assert_eq!(indent("lua", "function f() end\n", 4), 0);
824    }
825
826    // ---- BashLike ---------------------------------------------------------
827
828    #[test]
829    fn bash_indents_after_then_do_case() {
830        assert_eq!(indent("bash", "if true; then\n", 4), 4);
831        assert_eq!(indent("bash", "for x in a b; do\n", 4), 4);
832        assert_eq!(indent("bash", "case $x in\n", 4), 4);
833    }
834
835    #[test]
836    fn bash_resolves_from_syntect_name() {
837        // syntect names bash "Bourne Again Shell (bash)".
838        assert!(rules_for_syntax_name("Bourne Again Shell (bash)").is_some());
839    }
840
841    #[test]
842    fn fish_indents_after_block_openers() {
843        assert_eq!(indent("fish", "if test -n \"$name\"\n", 4), 4);
844        assert_eq!(indent("fish", "for item in $items\n", 4), 4);
845        assert_eq!(
846            indent("fish", "function greet --argument-names name\n", 4),
847            4
848        );
849        assert_eq!(indent("fish", "switch $name\n", 4), 4);
850        assert_eq!(indent("fish", "case fresh\n", 4), 4);
851    }
852
853    #[test]
854    fn fish_one_liner_with_end_does_not_indent() {
855        assert_eq!(
856            indent("fish", "if test -n \"$name\"; echo $name; end\n", 4),
857            0
858        );
859    }
860
861    // ---- PascalLike -------------------------------------------------------
862
863    #[test]
864    fn pascal_indents_after_begin() {
865        assert_eq!(indent("pascal", "begin\n", 4), 4);
866        assert_eq!(indent("pascal", "if x then begin\n", 4), 4);
867    }
868
869    #[test]
870    fn pascal_one_liner_with_end_does_not_indent() {
871        assert_eq!(indent("pascal", "begin end;\n", 4), 0);
872    }
873
874    // ---- SmaliLike --------------------------------------------------------
875
876    #[test]
877    fn smali_indents_after_method_directive() {
878        assert_eq!(
879            indent(
880                "smali",
881                ".method public onCreate(Landroid/os/Bundle;)V\n",
882                4
883            ),
884            4
885        );
886    }
887
888    #[test]
889    fn smali_dedents_before_end_directive() {
890        let content = ".method public main()V\n    .end method";
891        let pos = content.find(".end method").unwrap();
892        let got = rules_for_id("smali")
893            .unwrap()
894            .calculate_indent(&buf(content), pos, 4, |_| true);
895        assert_eq!(got, 0);
896    }
897
898    #[test]
899    fn smali_plain_field_does_not_indent() {
900        assert_eq!(indent("smali", ".field public static count:I = 0\n", 4), 0);
901    }
902
903    // ---- registry ---------------------------------------------------------
904
905    #[test]
906    fn unknown_language_has_no_rules() {
907        assert!(rules_for_id("brainfuck").is_none());
908    }
909
910    #[test]
911    fn families_compile() {
912        // Force the lazy table; a bad regex would drop to None and fail above.
913        assert!(rules_for_id("rust").unwrap().increase.is_some());
914        assert!(rules_for_id("python").unwrap().dedent_next_line.is_some());
915        assert!(rules_for_id("ruby").unwrap().self_close.is_some());
916    }
917
918    // A single test owns the global USER_RULES mutation so it can't race the
919    // other (read-only) tests under the parallel runner.
920    #[test]
921    fn user_overrides_register_and_merge() {
922        clear_user_rules();
923
924        // Full override for a language with no built-in family: config can add
925        // indentation for a language Fresh otherwise doesn't know.
926        set_user_rule(
927            "zz_newlang",
928            Some(r":\s*$"),
929            Some(r"^\s*end\b"),
930            None,
931            None,
932            None,
933        );
934        let r = rules_for_id("zz_newlang").expect("user rule registered");
935        assert_eq!(r.calculate_indent(&buf("foo:"), 4, 4, |_| true), 4);
936
937        // Partial override merges with the family: overriding `increase` only on
938        // a CurlyBrace language keeps the family's `decrease`.
939        set_user_rule("kotlin", Some(r"=>\s*$"), None, None, None, None);
940        let k = rules_for_id("kotlin").expect("kotlin via override");
941        assert!(
942            k.decrease.is_some(),
943            "decrease inherited from CurlyBrace family"
944        );
945        let c = "val f = x =>";
946        assert_eq!(k.calculate_indent(&buf(c), c.len(), 4, |_| true), 4);
947
948        clear_user_rules();
949        assert!(rules_for_id("zz_newlang").is_none(), "override cleared");
950        // kotlin falls back to its built-in CurlyBrace family rule.
951        assert!(rules_for_id("kotlin").is_some());
952    }
953}
954
/// Parity guard between the regex rules tier and the tree-sitter indenter.
///
/// Wherever tree-sitter is *authoritative*, the rules tier has to return the
/// same indent. That is the safety net for moving "indent-only" languages off
/// their tree-sitter grammars (design doc, phase 2): should a rule diverge
/// from the AST result on the corpus, this test fails before a grammar can be
/// dropped.
///
/// # Scope
///
/// Curly-brace languages and Python only. Those are the largest grammars
/// (C# ~29 MB, C++/TS ~17 MB of generated source), and tree-sitter parses
/// their block structure reliably even mid-edit, which makes it a sound
/// oracle. The corpus below currently holds TypeScript, Go and JavaScript
/// cases.
///
/// **Keyword-delimited families (Ruby/Lua/Bash/Pascal) are deliberately left
/// out.** On incomplete input — the ordinary "typed `def foo` and pressed
/// Enter" situation — tree-sitter cannot form a block node and the current
/// editor already falls back to copy-the-line indent. The rules tier indents
/// correctly there, so it is a strict *improvement*, not a regression. Those
/// families are pinned by the golden unit tests above instead.
///
/// # Conventions
///
/// The cursor mirrors the real press-Enter moment: the buffer ends exactly
/// where Enter is pressed (no trailing newline). Cases are clean code (no
/// strings/comments holding stray delimiters), so the rules tier runs with
/// masking disabled and the comparison is apples-to-apples. Cases for which
/// tree-sitter declines to decide (`None`) are skipped.
#[cfg(all(test, feature = "tree-sitter"))]
mod parity {
    use super::*;
    use crate::model::filesystem::NoopFileSystem;
    use crate::primitives::indent::IndentCalculator;
    use fresh_languages::Language;
    use std::sync::Arc;

    /// Creates an empty buffer over `NoopFileSystem` and inserts `content`
    /// at offset 0.
    fn buf(content: &str) -> Buffer {
        let filesystem = Arc::new(NoopFileSystem);
        let mut buffer = Buffer::empty(filesystem);
        buffer.insert(0, content);
        buffer
    }

    #[test]
    fn rules_match_tree_sitter_on_corpus() {
        // Each entry is (tree-sitter Language, rules id, code). The indent is
        // queried at end-of-buffer, i.e. where the cursor is when Enter is
        // pressed.
        //
        // A language can only be checked against the tree-sitter oracle if
        // its grammar is bundled (the other grammars were removed entirely).
        // The bundled curly-brace languages — Go, TypeScript, JavaScript —
        // exercise the CurlyBrace family, which the removed languages share,
        // so this still guards the dropped languages' behavior.
        let cases: &[(Language, &str, &str)] = &[
            (Language::TypeScript, "typescript", "function f() {"),
            (Language::TypeScript, "typescript", "class A {"),
            (Language::TypeScript, "typescript", "let x = 1;"),
            (Language::Go, "go", "func main() {"),
            (Language::JavaScript, "javascript", "function f() {"),
        ];

        let tab = 4;
        let mut compared = 0;
        let mut mismatches = Vec::new();
        for &(ref lang, id, code) in cases {
            let cursor = code.len();

            // Oracle side: ask the tree-sitter indenter first.
            let oracle = IndentCalculator::new().calculate_indent(&buf(code), cursor, lang, tab);
            let ts = match oracle {
                Some(level) => level,
                // tree-sitter declined; there is nothing to compare against.
                None => continue,
            };
            compared += 1;

            // Rules side: same code, same cursor, masking disabled.
            let language_rules = rules_for_id(id).unwrap_or_else(|| panic!("no rules for {id}"));
            let rules = language_rules.calculate_indent(&buf(code), cursor, tab, |_| true);
            if rules != ts {
                mismatches.push(format!(
                    "  {id}: code={code:?} tree-sitter={ts} rules={rules}"
                ));
            }
        }

        assert!(
            mismatches.is_empty(),
            "rules tier diverged from tree-sitter on {}/{} compared cases:\n{}",
            mismatches.len(),
            compared,
            mismatches.join("\n")
        );
        // If the corpus silently turned into all-skips (e.g. an API change
        // making tree-sitter always return None) the check above would pass
        // vacuously; require a minimum number of real comparisons.
        assert!(
            compared >= 4,
            "too few comparable cases ({compared}); guard is vacuous"
        );
    }
}