Skip to main content

shape_ast/parser/
preprocessor.rs

1/// Automatic semicolon insertion preprocessor.
2///
3/// Resolves newline ambiguities where `[...]` or `(...)` on a new line could
4/// be parsed as index access or function call (Pest's greedy postfix matching).
5///
6/// Inserts `;` at the end of lines where:
7/// 1. The line ends with a statement-ending token (identifier char, `)`, `]`, `}`, `"`)
8/// 2. The next non-empty line starts with `[` or `(`
9///
10/// This mirrors Go's automatic semicolon insertion strategy.
11pub fn preprocess_semicolons(source: &str) -> String {
12    let lines: Vec<&str> = source.split('\n').collect();
13    if lines.len() <= 1 {
14        return source.to_string();
15    }
16
17    let mut result = String::with_capacity(source.len() + 64);
18    let mut in_block_comment = false;
19    let mut in_triple_string = false;
20
21    for i in 0..lines.len() {
22        let line = lines[i];
23
24        // Determine the effective last character on this line, skipping comments/strings
25        let last_char = effective_last_char(line, &mut in_block_comment, &mut in_triple_string);
26
27        let needs_semicolon = if let Some(ch) = last_char {
28            is_statement_ender(ch)
29                && next_nonblank_starts_with_bracket_or_paren(&lines, i + 1)
30        } else {
31            false
32        };
33
34        result.push_str(line);
35        if needs_semicolon {
36            result.push(';');
37        }
38        if i < lines.len() - 1 {
39            result.push('\n');
40        }
41    }
42
43    result
44}
45
/// Returns the last significant (non-whitespace, non-comment) character on a line,
/// while tracking block comment and triple-string state across lines.
///
/// The line is scanned left to right as real Unicode characters, so a line
/// ending in a multi-byte identifier character (e.g. `é`) reports that
/// character rather than one of its UTF-8 bytes.
///
/// What counts as significant:
/// * Any non-whitespace character outside strings and comments.
/// * The closing `"` of a simple string or the closing `"""` of a triple
///   string, reported as `'"'`. Characters inside a string never count.
/// * Nothing inside `/* ... */`, and nothing from `//` to end of line.
///
/// # Parameters
/// * `line` — one source line, without its `\n`.
/// * `in_block_comment` — in/out: `true` on entry if a `/* ... */` comment
///   opened on an earlier line is still open; updated to the state at the end
///   of this line.
/// * `in_triple_string` — in/out: same, for a `"""` string.
///
/// # Returns
/// The last significant character, or `None` if the line has none (blank,
/// comment-only, or entirely inside an open comment/string).
///
/// Simple `"..."` strings are tracked per line only: an unterminated one does
/// not carry over to the next call.
fn effective_last_char(
    line: &str,
    in_block_comment: &mut bool,
    in_triple_string: &mut bool,
) -> Option<char> {
    const TRIPLE_QUOTE: &str = "\"\"\"";
    const BLOCK_OPEN: &str = "/*";
    const BLOCK_CLOSE: &str = "*/";
    const LINE_COMMENT: &str = "//";

    let mut last_significant: Option<char> = None;
    let mut in_simple_string = false;
    // Unscanned tail of the line. It always starts on a char boundary because
    // every step consumes whole characters or an all-ASCII delimiter.
    let mut rest = line;

    while let Some(ch) = rest.chars().next() {
        // Bytes to consume this step; defaults to the current character.
        let mut consumed = ch.len_utf8();

        if *in_triple_string {
            // Inside triple-quoted string — only the closing `"""` matters.
            if rest.starts_with(TRIPLE_QUOTE) {
                *in_triple_string = false;
                last_significant = Some('"');
                consumed = TRIPLE_QUOTE.len();
            }
        } else if *in_block_comment {
            // Inside block comment — only the closing `*/` matters.
            if rest.starts_with(BLOCK_CLOSE) {
                *in_block_comment = false;
                consumed = BLOCK_CLOSE.len();
            }
        } else if in_simple_string {
            match ch {
                '\\' => {
                    // Skip the backslash together with the character it
                    // escapes, so `\"` does not close the string.
                    consumed += rest[consumed..].chars().next().map_or(0, char::len_utf8);
                }
                '"' => {
                    in_simple_string = false;
                    last_significant = Some('"');
                }
                _ => {}
            }
        } else if rest.starts_with(TRIPLE_QUOTE) {
            // Must be tested before the single-quote case below.
            *in_triple_string = true;
            consumed = TRIPLE_QUOTE.len();
        } else if ch == '"' {
            in_simple_string = true;
        } else if rest.starts_with(LINE_COMMENT) {
            // Line comment — rest of line is comment.
            break;
        } else if rest.starts_with(BLOCK_OPEN) {
            *in_block_comment = true;
            consumed = BLOCK_OPEN.len();
        } else if !ch.is_whitespace() {
            // Includes a lone `/` (e.g. a division operator).
            last_significant = Some(ch);
        }

        rest = &rest[consumed..];
    }

    last_significant
}
134
/// Reports whether `ch`, seen as the last significant character of a line,
/// marks the end of a complete statement.
///
/// Statement enders are identifier characters (alphanumeric or `_`), the
/// closing delimiters `)`, `]`, `}`, and a closing string quote `"`.
fn is_statement_ender(ch: char) -> bool {
    matches!(ch, '_' | ')' | ']' | '}' | '"') || ch.is_alphanumeric()
}
139
/// Finds the first non-blank line at or after index `from` and reports
/// whether, with surrounding whitespace trimmed, it begins with `[` or `(`.
///
/// Only that first non-blank line is examined; later lines are never
/// considered. Returns `false` when every remaining line is blank or when
/// `from` is at or past the end of `lines`.
fn next_nonblank_starts_with_bracket_or_paren(lines: &[&str], from: usize) -> bool {
    lines
        .iter()
        .skip(from)
        .map(|raw| raw.trim())
        .find(|candidate| !candidate.is_empty())
        .map_or(false, |candidate| {
            matches!(candidate.chars().next(), Some('[') | Some('('))
        })
}
150
#[cfg(test)]
mod tests {
    use super::*;

    // --- Insertion before `[` ---

    /// An identifier at end of line followed by an array literal gets a `;`.
    #[test]
    fn test_insert_after_identifier_before_bracket() {
        let got = preprocess_semicolons("let x = foo\n[1, 2]");
        assert_eq!(got, "let x = foo;\n[1, 2]");
    }

    /// A closing `)` at end of line followed by an array literal gets a `;`.
    #[test]
    fn test_insert_after_paren_before_bracket() {
        let got =
            preprocess_semicolons("let m = HashMap().set(\"x\", None)\n[m.has(\"x\"), m.get(\"x\")]");
        assert!(
            got.contains(");\n["),
            "should insert ; after closing paren"
        );
    }

    /// A trailing comma marks a continuation, so the source is left alone.
    #[test]
    fn test_no_insert_when_line_ends_with_comma() {
        let src = "foo(a,\n[1, 2])";
        assert_eq!(preprocess_semicolons(src), src, "comma means continuation");
    }

    /// A trailing dot marks a continuation, so the source is left alone.
    #[test]
    fn test_no_insert_when_line_ends_with_dot() {
        let src = "foo.\n[0]";
        assert_eq!(
            preprocess_semicolons(src),
            src,
            "dot means method chain continuation"
        );
    }

    /// Without `[` or `(` on the following line nothing is inserted.
    #[test]
    fn test_no_insert_when_next_line_not_bracket() {
        let src = "let x = 5\nlet y = 10";
        assert_eq!(preprocess_semicolons(src), src, "no bracket on next line");
    }

    /// Blank lines between the statement and the `[` line are skipped over.
    #[test]
    fn test_skips_blank_lines() {
        let got = preprocess_semicolons("let x = foo\n\n[1, 2]");
        assert_eq!(got, "let x = foo;\n\n[1, 2]");
    }

    /// The trailing `//` comment is ignored when finding the last significant
    /// character; the `;` itself is appended at the very end of the line.
    #[test]
    fn test_line_comment_stripped() {
        let got = preprocess_semicolons("let x = foo // comment\n[1, 2]");
        assert_eq!(got, "let x = foo // comment;\n[1, 2]");
    }

    /// Line ends inside block comment — no insertion.
    #[test]
    fn test_block_comment_tracked() {
        let src = "let x = foo /* start\nend */ [1, 2]";
        assert_eq!(preprocess_semicolons(src), src, "inside block comment");
    }

    /// `//` inside a string literal is not a comment; the closing quote ends
    /// the statement.
    #[test]
    fn test_string_not_confused_with_comment() {
        let got = preprocess_semicolons("let x = \"//not a comment\"\n[1, 2]");
        assert!(
            got.contains("\";\n["),
            "string ending with quote is a statement ender"
        );
    }

    /// A closing `]` ends a statement.
    #[test]
    fn test_closing_bracket_before_bracket() {
        let got = preprocess_semicolons("let a = [10, 20, 30]\n[a.first(), a.last()]");
        assert!(got.contains("];\n["), "closing ] is a statement ender");
    }

    /// A closing `}` ends a statement.
    #[test]
    fn test_closing_brace_before_bracket() {
        let got = preprocess_semicolons("let f = { x: 1 }\n[1, 2]");
        assert!(got.contains("};\n["), "closing }} is a statement ender");
    }

    /// A trailing binary operator means the expression continues.
    #[test]
    fn test_no_insert_after_operator() {
        let src = "let x = a +\n[1, 2]";
        assert_eq!(preprocess_semicolons(src), src, "+ means expression continues");
    }

    /// Input without any newline is returned as-is.
    #[test]
    fn test_single_line_unchanged() {
        let src = "let x = [1, 2, 3]";
        assert_eq!(preprocess_semicolons(src), src);
    }

    /// Empty input yields empty output.
    #[test]
    fn test_empty_input() {
        assert_eq!(preprocess_semicolons(""), "");
    }

    /// Content inside `""" ... """` must not trigger insertion; the `;` goes
    /// after the closing `"""` line only.
    #[test]
    fn test_no_insert_inside_triple_string() {
        let src = concat!(
            "let s = \"\"\"\n",
            "foo\n",
            "[bar]\n",
            "\"\"\"\n",
            "[1, 2]",
        );
        let got = preprocess_semicolons(src);
        // The semicolon belongs after the closing """ line, not inside.
        assert!(
            got.contains("\"\"\";\n[1, 2]"),
            "semicolon after triple string close, got: {}",
            got
        );
        // Lines inside the triple string stay untouched.
        assert!(
            !got.contains("foo;\n"),
            "no insertion inside triple string"
        );
    }

    // --- Insertion before `(` ---

    /// A closing `}` followed by a parenthesised expression gets a `;`.
    #[test]
    fn test_insert_before_paren_on_new_line() {
        let got = preprocess_semicolons("let b = Pt { x: 10.0, y: 20.0 }\n(a + b).x");
        assert_eq!(got, "let b = Pt { x: 10.0, y: 20.0 };\n(a + b).x");
    }

    /// An identifier followed by a parenthesised expression gets a `;`.
    #[test]
    fn test_insert_before_paren_after_identifier() {
        let got = preprocess_semicolons("let dy = self.y2 - self.y1\n(dx * dx + dy * dy)");
        assert_eq!(
            got,
            "let dy = self.y2 - self.y1;\n(dx * dx + dy * dy)"
        );
    }

    /// A trailing operator before `(` means the expression continues.
    #[test]
    fn test_no_insert_before_paren_after_operator() {
        let src = "let x = a +\n(b + c)";
        assert_eq!(preprocess_semicolons(src), src, "+ means expression continues");
    }

    /// A trailing comma before `(` means the argument list continues.
    #[test]
    fn test_no_insert_before_paren_after_comma() {
        let src = "foo(a,\n(b + c))";
        assert_eq!(preprocess_semicolons(src), src, "comma means continuation");
    }

    /// A trailing `=` before `(` means the assignment continues.
    #[test]
    fn test_no_insert_before_paren_after_equals() {
        let src = "let x =\n(1 + 2)";
        assert_eq!(preprocess_semicolons(src), src, "= means assignment continues");
    }

    /// End-to-end check on a HashMap builder call followed by an array literal.
    #[test]
    fn test_real_hashmap_pattern() {
        let src = "let m = HashMap().set(\"x\", None)\n[m.has(\"x\"), m.get(\"x\") == None]";
        let want = "let m = HashMap().set(\"x\", None);\n[m.has(\"x\"), m.get(\"x\") == None]";
        assert_eq!(preprocess_semicolons(src), want);
    }

    // --- Multiline triple-string tests ---

    /// A triple string spanning multiple lines with `[` inside should not
    /// cause semicolon insertion anywhere inside the string.
    #[test]
    fn test_triple_string_multiline_with_bracket_inside() {
        let src = concat!(
            "let s = \"\"\"\n",
            "    this has\n",
            "    [brackets inside]\n",
            "    the string\n",
            "\"\"\"\n",
            "let x = 5",
        );
        let got = preprocess_semicolons(src);
        // No semicolons should appear inside the triple string.
        assert!(
            !got.contains("has;\n"),
            "no insertion inside triple string"
        );
        assert!(
            !got.contains("inside];\n"),
            "no insertion inside triple string before brackets"
        );
        assert_eq!(got, src, "no changes needed here");
    }

    /// Triple string opened and closed on one line, followed by an array literal.
    #[test]
    fn test_triple_string_ending_then_array_on_next_line() {
        let got = preprocess_semicolons("let s = \"\"\"hello\"\"\"\n[1, 2]");
        assert!(
            got.contains("\"\"\";\n[1, 2]"),
            "semicolon after triple string close before [, got: {}",
            got
        );
    }

    /// Multi-line triple string whose closing `"""` sits on its own line,
    /// followed by an array literal.
    #[test]
    fn test_triple_string_multiline_close_then_array() {
        let got = preprocess_semicolons("let s = \"\"\"\n    content\n    \"\"\"\n[1, 2]");
        assert!(
            got.contains("\"\"\";\n[1, 2]"),
            "semicolon after closing triple-quote line, got: {}",
            got
        );
    }

    /// An indented multi-line string followed by an array literal.
    #[test]
    fn test_triple_string_with_indented_bracket_lines() {
        let src = "let a_str = \"\"\"\n            this is\n            a multiline\n            string.\n              -it should indent\n            \"\"\"\n[a_str.length]";
        let got = preprocess_semicolons(src);
        // Semicolon only after the closing """.
        assert!(
            got.contains("\"\"\";\n[a_str"),
            "semicolon after triple string, got: {}",
            got
        );
        // No semicolons inside the string content.
        assert!(!got.contains("is;\n"), "no insertion inside string");
        assert!(!got.contains("indent;\n"), "no insertion inside string");
    }

    /// `f""" ... """` is tracked as well (the `f` prefix precedes the quotes).
    #[test]
    fn test_formatted_triple_string_tracked() {
        let src = "let s = f\"\"\"\n    value: {x}\n    [y]\n    \"\"\"\n[1, 2]";
        let got = preprocess_semicolons(src);
        // No insertion inside the formatted triple string.
        assert!(
            !got.contains("{x};\n"),
            "no insertion inside f-triple string"
        );
        // Semicolon after the closing """.
        assert!(
            got.contains("\"\"\";\n[1, 2]"),
            "semicolon after f-triple string close, got: {}",
            got
        );
    }

    /// Two triple strings back to back, then an array literal.
    #[test]
    fn test_multiple_triple_strings_in_sequence() {
        let src = concat!(
            "let a = \"\"\"\n",
            "  [inside a]\n",
            "  \"\"\"\n",
            "let b = \"\"\"\n",
            "  [inside b]\n",
            "  \"\"\"\n",
            "[1, 2]",
        );
        let got = preprocess_semicolons(src);
        // No insertion inside either triple string.
        assert!(
            !got.contains("a];\n"),
            "no insertion inside first string"
        );
        assert!(
            !got.contains("b];\n"),
            "no insertion inside second string"
        );
        // Semicolon before the final array.
        assert!(
            got.contains("\"\"\";\n[1, 2]"),
            "semicolon before final array, got: {}",
            got
        );
    }
}