Skip to main content

shape_ast/parser/
preprocessor.rs

1/// Automatic semicolon insertion preprocessor.
2///
3/// Resolves newline ambiguities where `[...]` or `(...)` on a new line could
4/// be parsed as index access or function call (Pest's greedy postfix matching).
5///
6/// Inserts `;` at the end of lines where:
7/// 1. The line ends with a statement-ending token (identifier char, `)`, `]`, `}`, `"`)
8/// 2. The next non-empty line starts with `[` or `(`
9///
10/// This mirrors Go's automatic semicolon insertion strategy.
11pub fn preprocess_semicolons(source: &str) -> String {
12    let lines: Vec<&str> = source.split('\n').collect();
13    if lines.len() <= 1 {
14        return source.to_string();
15    }
16
17    let mut result = String::with_capacity(source.len() + 64);
18    let mut in_block_comment = false;
19    let mut in_triple_string = false;
20
21    for i in 0..lines.len() {
22        let line = lines[i];
23
24        // Determine the effective last character on this line, skipping comments/strings
25        let last_char = effective_last_char(line, &mut in_block_comment, &mut in_triple_string);
26
27        let needs_semicolon = if let Some(ch) = last_char {
28            is_statement_ender(ch) && next_nonblank_starts_with_bracket_or_paren(&lines, i + 1)
29        } else {
30            false
31        };
32
33        result.push_str(line);
34        if needs_semicolon {
35            result.push(';');
36        }
37        if i < lines.len() - 1 {
38            result.push('\n');
39        }
40    }
41
42    result
43}
44
/// Returns the last significant (non-whitespace, non-comment) character on a
/// line, while tracking block-comment and triple-string state across lines.
///
/// * `line` - one source line, without its trailing `\n`.
/// * `in_block_comment` - set on entry if a `/* ... */` comment is still open
///   from an earlier line; updated to reflect the state at the end of `line`.
/// * `in_triple_string` - same, for a `""" ... """` string.
///
/// Returns `None` when the line contains nothing significant (it is blank,
/// is entirely comment, or lies entirely inside an open comment/string).
/// A closing `"` or `"""` is reported as `'"'`; characters inside strings and
/// comments are never reported.
///
/// The line is scanned as Unicode scalar values rather than raw bytes, so a
/// line ending in a multi-byte character (e.g. `café`) reports that character
/// itself instead of a reinterpreted UTF-8 continuation byte.
fn effective_last_char(
    line: &str,
    in_block_comment: &mut bool,
    in_triple_string: &mut bool,
) -> Option<char> {
    const TRIPLE_QUOTE: &str = "\"\"\"";

    let mut last_significant: Option<char> = None;
    // Simple strings are single-line, so this state is local to the call.
    let mut in_simple_string = false;
    // Unscanned tail of the line; always starts on a char boundary.
    let mut rest = line;

    while let Some(ch) = rest.chars().next() {
        let after_ch = &rest[ch.len_utf8()..];

        // Inside triple-quoted string: scan for the closing """.
        if *in_triple_string {
            rest = match rest.strip_prefix(TRIPLE_QUOTE) {
                Some(after_close) => {
                    *in_triple_string = false;
                    last_significant = Some('"');
                    after_close
                }
                None => after_ch,
            };
            continue;
        }

        // Inside block comment: scan for the closing */.
        if *in_block_comment {
            rest = match rest.strip_prefix("*/") {
                Some(after_close) => {
                    *in_block_comment = false;
                    after_close
                }
                None => after_ch,
            };
            continue;
        }

        // Inside simple (single-line) string.
        if in_simple_string {
            rest = match ch {
                '\\' => {
                    // Skip the escaped character (if any) as a whole char.
                    let mut escaped = after_ch.chars();
                    escaped.next();
                    escaped.as_str()
                }
                '"' => {
                    in_simple_string = false;
                    last_significant = Some('"');
                    after_ch
                }
                _ => after_ch,
            };
            continue;
        }

        // Not in any special context.
        rest = match ch {
            // Opening quote: triple-quote takes precedence over a simple string.
            '"' => match rest.strip_prefix(TRIPLE_QUOTE) {
                Some(after_open) => {
                    *in_triple_string = true;
                    after_open
                }
                None => {
                    in_simple_string = true;
                    after_ch
                }
            },
            // Line comment: the rest of the line is comment.
            '/' if after_ch.starts_with('/') => break,
            // Block comment opener: skip both `/` and `*`.
            '/' if after_ch.starts_with('*') => {
                *in_block_comment = true;
                &after_ch[1..]
            }
            // Anything else (including a lone `/`) is significant unless blank.
            _ => {
                if !ch.is_whitespace() {
                    last_significant = Some(ch);
                }
                after_ch
            }
        };
    }

    last_significant
}
133
/// Tells whether `ch`, seen as the last significant character of a line,
/// marks the end of a complete statement: an identifier character
/// (alphanumeric or `_`), a closing `)`, `]`, `}`, or a closing `"`.
fn is_statement_ender(ch: char) -> bool {
    matches!(ch, '_' | ')' | ']' | '}' | '"') || ch.is_alphanumeric()
}
138
/// Looks at the first non-blank line at or after index `from` and reports
/// whether, once trimmed, it begins with `[` or `(`.
///
/// Returns `false` when every remaining line is blank or `from` is past the
/// end of `lines`.
fn next_nonblank_starts_with_bracket_or_paren(lines: &[&str], from: usize) -> bool {
    lines
        .iter()
        .skip(from)
        .map(|candidate| candidate.trim())
        .find(|candidate| !candidate.is_empty())
        .map_or(false, |first| {
            first.starts_with(|c: char| c == '[' || c == '(')
        })
}
149
#[cfg(test)]
mod tests {
    use super::*;

    // --- Insertion before `[` ---

    #[test]
    fn test_insert_after_identifier_before_bracket() {
        assert_eq!(
            preprocess_semicolons("let x = foo\n[1, 2]"),
            "let x = foo;\n[1, 2]"
        );
    }

    #[test]
    fn test_insert_after_paren_before_bracket() {
        let actual =
            preprocess_semicolons("let m = HashMap().set(\"x\", None)\n[m.has(\"x\"), m.get(\"x\")]");
        assert!(
            actual.contains(");\n["),
            "should insert ; after closing paren"
        );
    }

    #[test]
    fn test_no_insert_when_line_ends_with_comma() {
        let src = "foo(a,\n[1, 2])";
        assert_eq!(preprocess_semicolons(src), src, "comma means continuation");
    }

    #[test]
    fn test_no_insert_when_line_ends_with_dot() {
        let src = "foo.\n[0]";
        assert_eq!(
            preprocess_semicolons(src),
            src,
            "dot means method chain continuation"
        );
    }

    #[test]
    fn test_no_insert_when_next_line_not_bracket() {
        let src = "let x = 5\nlet y = 10";
        assert_eq!(preprocess_semicolons(src), src, "no bracket on next line");
    }

    #[test]
    fn test_skips_blank_lines() {
        assert_eq!(
            preprocess_semicolons("let x = foo\n\n[1, 2]"),
            "let x = foo;\n\n[1, 2]"
        );
    }

    #[test]
    fn test_line_comment_stripped() {
        // The trailing comment is ignored when deciding; the `;` goes at end of line.
        assert_eq!(
            preprocess_semicolons("let x = foo // comment\n[1, 2]"),
            "let x = foo // comment;\n[1, 2]"
        );
    }

    #[test]
    fn test_block_comment_tracked() {
        // First line ends while the block comment is still open: nothing is inserted.
        let src = "let x = foo /* start\nend */ [1, 2]";
        assert_eq!(preprocess_semicolons(src), src, "inside block comment");
    }

    #[test]
    fn test_string_not_confused_with_comment() {
        let actual = preprocess_semicolons("let x = \"//not a comment\"\n[1, 2]");
        assert!(
            actual.contains("\";\n["),
            "string ending with quote is a statement ender"
        );
    }

    #[test]
    fn test_closing_bracket_before_bracket() {
        let actual = preprocess_semicolons("let a = [10, 20, 30]\n[a.first(), a.last()]");
        assert!(actual.contains("];\n["), "closing ] is a statement ender");
    }

    #[test]
    fn test_closing_brace_before_bracket() {
        let actual = preprocess_semicolons("let f = { x: 1 }\n[1, 2]");
        assert!(actual.contains("};\n["), "closing }} is a statement ender");
    }

    #[test]
    fn test_no_insert_after_operator() {
        let src = "let x = a +\n[1, 2]";
        assert_eq!(preprocess_semicolons(src), src, "+ means expression continues");
    }

    #[test]
    fn test_single_line_unchanged() {
        let src = "let x = [1, 2, 3]";
        assert_eq!(preprocess_semicolons(src), src);
    }

    #[test]
    fn test_empty_input() {
        let actual = preprocess_semicolons("");
        assert_eq!(actual, "");
    }

    #[test]
    fn test_no_insert_inside_triple_string() {
        // Lines that are content of a """ """ string must never receive a `;`.
        let actual = preprocess_semicolons("let s = \"\"\"\nfoo\n[bar]\n\"\"\"\n[1, 2]");
        // The only insertion belongs on the line that closes the string.
        assert!(
            actual.contains("\"\"\";\n[1, 2]"),
            "semicolon after triple string close, got: {}",
            actual
        );
        // String content stays untouched.
        assert!(
            !actual.contains("foo;\n"),
            "no insertion inside triple string"
        );
    }

    // --- Insertion before `(` ---

    #[test]
    fn test_insert_before_paren_on_new_line() {
        assert_eq!(
            preprocess_semicolons("let b = Pt { x: 10.0, y: 20.0 }\n(a + b).x"),
            "let b = Pt { x: 10.0, y: 20.0 };\n(a + b).x"
        );
    }

    #[test]
    fn test_insert_before_paren_after_identifier() {
        assert_eq!(
            preprocess_semicolons("let dy = self.y2 - self.y1\n(dx * dx + dy * dy)"),
            "let dy = self.y2 - self.y1;\n(dx * dx + dy * dy)"
        );
    }

    #[test]
    fn test_no_insert_before_paren_after_operator() {
        let src = "let x = a +\n(b + c)";
        assert_eq!(preprocess_semicolons(src), src, "+ means expression continues");
    }

    #[test]
    fn test_no_insert_before_paren_after_comma() {
        let src = "foo(a,\n(b + c))";
        assert_eq!(preprocess_semicolons(src), src, "comma means continuation");
    }

    #[test]
    fn test_no_insert_before_paren_after_equals() {
        let src = "let x =\n(1 + 2)";
        assert_eq!(preprocess_semicolons(src), src, "= means assignment continues");
    }

    #[test]
    fn test_real_hashmap_pattern() {
        let src = r#"let m = HashMap().set("x", None)
[m.has("x"), m.get("x") == None]"#;
        let want = r#"let m = HashMap().set("x", None);
[m.has("x"), m.get("x") == None]"#;
        assert_eq!(preprocess_semicolons(src), want);
    }

    // --- Multiline triple-string tests ---

    #[test]
    fn test_triple_string_multiline_with_bracket_inside() {
        // A `[` at the start of a line inside a multi-line triple string must
        // not trigger an insertion on any line of that string.
        let src = r#"let s = """
    this has
    [brackets inside]
    the string
"""
let x = 5"#;
        let actual = preprocess_semicolons(src);
        // Nothing inside the string may be modified.
        assert!(
            !actual.contains("has;\n"),
            "no insertion inside triple string"
        );
        assert!(
            !actual.contains("inside];\n"),
            "no insertion inside triple string before brackets"
        );
        assert_eq!(actual, src, "no changes needed here");
    }

    #[test]
    fn test_triple_string_ending_then_array_on_next_line() {
        // Single-line triple string, then an array literal on the next line.
        let actual = preprocess_semicolons("let s = \"\"\"hello\"\"\"\n[1, 2]");
        assert!(
            actual.contains("\"\"\";\n[1, 2]"),
            "semicolon after triple string close before [, got: {}",
            actual
        );
    }

    #[test]
    fn test_triple_string_multiline_close_then_array() {
        // The closing """ sits on its own (indented) line and is followed by
        // an array literal.
        let actual = preprocess_semicolons("let s = \"\"\"\n    content\n    \"\"\"\n[1, 2]");
        assert!(
            actual.contains("\"\"\";\n[1, 2]"),
            "semicolon after closing triple-quote line, got: {}",
            actual
        );
    }

    #[test]
    fn test_triple_string_with_indented_bracket_lines() {
        // Indented multi-line string followed directly by an array literal.
        let src = "let a_str = \"\"\"\n            this is\n            a multiline\n            string.\n              -it should indent\n            \"\"\"\n[a_str.length]";
        let actual = preprocess_semicolons(src);
        // Exactly the closing line gets the `;`.
        assert!(
            actual.contains("\"\"\";\n[a_str"),
            "semicolon after triple string, got: {}",
            actual
        );
        // The string's own lines are left alone.
        assert!(!actual.contains("is;\n"), "no insertion inside string");
        assert!(!actual.contains("indent;\n"), "no insertion inside string");
    }

    #[test]
    fn test_formatted_triple_string_tracked() {
        // The `f` prefix precedes the quotes, so f""" ... """ is tracked too.
        let actual =
            preprocess_semicolons("let s = f\"\"\"\n    value: {x}\n    [y]\n    \"\"\"\n[1, 2]");
        // Content of the formatted string is untouched.
        assert!(
            !actual.contains("{x};\n"),
            "no insertion inside f-triple string"
        );
        // The closing line receives the `;`.
        assert!(
            actual.contains("\"\"\";\n[1, 2]"),
            "semicolon after f-triple string close, got: {}",
            actual
        );
    }

    #[test]
    fn test_multiple_triple_strings_in_sequence() {
        let src = "let a = \"\"\"\n  [inside a]\n  \"\"\"\nlet b = \"\"\"\n  [inside b]\n  \"\"\"\n[1, 2]";
        let actual = preprocess_semicolons(src);
        // Neither string's content is modified.
        assert!(
            !actual.contains("a];\n"),
            "no insertion inside first string"
        );
        assert!(
            !actual.contains("b];\n"),
            "no insertion inside second string"
        );
        // The statement before the trailing array is terminated.
        assert!(
            actual.contains("\"\"\";\n[1, 2]"),
            "semicolon before final array, got: {}",
            actual
        );
    }
}