json_fix/
fixer.rs

1use fancy_regex::{Captures, Regex};
2use serde_json;
3
/// Outcome of running [`fix_json_syntax`] on a piece of JSON-like text.
///
/// The fixer targets malformed AI or chatbot output and tries to recover from
/// common formatting issues; this report describes what it did.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FixReport {
    /// The input exactly as it was handed to the fixer.
    pub original: String,
    /// The text after the repair attempt (pretty-printed when it parsed as JSON).
    pub fixed: String,
    /// Human-readable labels of the recorded repair steps, in the order they ran.
    pub steps: Vec<String>,
    /// `true` when the final text parsed as JSON.
    pub success: bool,
}
13
/// Tag naming an individual repair step; it is handed to `apply_step`
/// alongside the transformation to run.
///
/// The enum is a plain fieldless marker, so it is `Copy` and comparable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum FixStep {
    /// Rewrites the sequence `\",` to `",`.
    RemoveEscapedQuoteComma,
    // Further steps can be added here as they are migrated to `apply_step`.
}
19
20fn apply_step<F: FnOnce(String) -> String>(input: String, _step: FixStep, f: F) -> String {
21    f(input)
22}
23
24pub fn fix_json_syntax(input: &str) -> FixReport {
25    let mut steps = Vec::new();
26    let mut fixed = input.trim().to_string();
27
28    // 1. Strip markdown wrappers or triple quotes
29    if fixed.starts_with("```json") || fixed.starts_with("```") {
30        fixed = fixed
31            .trim_start_matches("```json")
32            .trim_start_matches("```")
33            .trim_end_matches("```")
34            .trim()
35            .to_string();
36        steps.push("Stripped markdown wrappers or triple quotes".to_string());
37    }
38
39    // 2. Fix missing commas between fields (e.g. "title": "x" "body": "y")
40    let re_missing_commas = Regex::new(r#"(\"[^\"]+\"\s*:\s*\"[^\"]+\")\s+\""#).unwrap();
41    let new_fixed = re_missing_commas.replace_all(&fixed, "$1,\n\"").to_string();
42    if new_fixed != fixed {
43        fixed = new_fixed;
44        steps.push("Fixed missing commas between fields".to_string());
45    }
46
47    // 3. Fix adjacent object blocks
48    let new_fixed = fixed.replace("}\n{", "},\n{");
49    if new_fixed != fixed {
50        fixed = new_fixed;
51        steps.push("Fixed adjacent object blocks".to_string());
52    }
53
54    // 4. Fix generic missing commas between quoted values
55    let re_adjacent_quoted = Regex::new(r#""\s+""#).unwrap();
56    let new_fixed = re_adjacent_quoted
57        .replace_all(&fixed, "\",\n\"")
58        .to_string();
59    if new_fixed != fixed {
60        fixed = new_fixed;
61        steps.push("Fixed generic missing commas between quoted values".to_string());
62    }
63
64    // 5. Remove trailing commas in arrays or objects
65    let re_trailing_commas = Regex::new(r",\s*([\]}])").unwrap();
66    let new_fixed = re_trailing_commas.replace_all(&fixed, "$1").to_string();
67    if new_fixed != fixed {
68        fixed = new_fixed;
69        steps.push("Removed trailing commas".to_string());
70    }
71
72    // 6. Convert single quotes to double quotes (if outside word boundaries)
73    let re_single_quotes = Regex::new(r"'([^']*)'").unwrap();
74    let new_fixed = re_single_quotes.replace_all(&fixed, "\"$1\"").to_string();
75    if new_fixed != fixed {
76        fixed = new_fixed;
77        steps.push("Converted single quotes to double quotes".to_string());
78    }
79
80    // 7. Convert curly quotes and weird symbols
81    let new_fixed = fixed
82        .replace('“', "\"")
83        .replace('”', "\"")
84        .replace('‘', "'")
85        .replace('’', "'");
86    if new_fixed != fixed {
87        fixed = new_fixed;
88        steps.push("Converted curly quotes and weird symbols".to_string());
89    }
90
91    // 9 and 8. Fix broken contractions and apostrophes.
92    // Note: Step 9 must come before Step 8 to avoid conflicts.
93    // Step 9 fixes contractions written with double quotes instead of apostrophes (e.g. I"m → I'm)
94    let re_broken_contractions = Regex::new(r#"(\b\w+)"(\w+)"#).unwrap();
95    let new_fixed = re_broken_contractions
96        .replace_all(&fixed, "$1'$2")
97        .to_string();
98    if new_fixed != fixed {
99        fixed = new_fixed;
100        steps.push("Fixed broken contractions written with double quotes".to_string());
101    }
102
103    // Step 8 fixes broken apostrophes written as quotes (e.g., it"s → it's)
104    let re_broken_apostrophes = Regex::new(r#"(\w)"([sdmt])\b"#).unwrap();
105    let new_fixed = re_broken_apostrophes
106        .replace_all(&fixed, "$1'$2")
107        .to_string();
108    if new_fixed != fixed {
109        fixed = new_fixed;
110        steps.push("Fixed broken apostrophes written as quotes".to_string());
111    }
112
113    // 10. Handle escaped stringified JSON
114    if fixed.starts_with('\"') && fixed.ends_with('\"') {
115        if let Ok(unescaped) = serde_json::from_str::<String>(&fixed) {
116            if unescaped != fixed {
117                fixed = unescaped;
118                steps.push("Handled escaped stringified JSON".to_string());
119            }
120        }
121    }
122
123    // 11. Quote unquoted keys (e.g. name: "John" → "name": "John")
124    let re_unquoted_keys = Regex::new(r#"(?m)(^|[{,\s])(\w+)(\s*:\s*)""#).unwrap();
125    let new_fixed = re_unquoted_keys
126        .replace_all(&fixed, "$1\"$2\"$3")
127        .to_string();
128    if new_fixed != fixed {
129        fixed = new_fixed;
130        steps.push("Quoted unquoted keys".to_string());
131    }
132
133    // 12. Escape unescaped inner double quotes within values
134    // Note: This step escapes inner quotes inside string values.
135    // Step 14.6 also escapes inner quotes, so order and pattern safeguards are important to avoid double escaping.
136    let re_unescaped_inner_quotes = Regex::new(r#":\s*"([^"]*?)"([^\\"][^"]*?)""#).unwrap();
137    let new_fixed = re_unescaped_inner_quotes
138        .replace_all(&fixed, r#": "$1\"$2""#)
139        .to_string();
140    if new_fixed != fixed {
141        fixed = new_fixed;
142        steps.push("Escaped unescaped inner double quotes within values".to_string());
143    }
144
145    // 13. Remove invalid escape sequences
146    let re_invalid_escapes = Regex::new(r#"\\[^"\\/bfnrt]"#).unwrap();
147    let new_fixed = re_invalid_escapes.replace_all(&fixed, "").to_string();
148    if new_fixed != fixed {
149        fixed = new_fixed;
150        steps.push("Removed invalid escape sequences".to_string());
151    }
152
153    // 14. Convert raw newlines in strings to \n
154    // Fixed closure signature to use fancy_regex::Captures as required by fancy_regex crate.
155    let re_multiline_strings = Regex::new(r#""([^"]*?)\n([^"]*?)""#).unwrap();
156    let new_fixed = re_multiline_strings
157        .replace_all(&fixed, |caps: &Captures| {
158            let first = &caps[1].replace('\n', "\\n");
159            let second = &caps[2].replace('\n', "\\n");
160            format!("\"{}\\n{}\"", first, second)
161        })
162        .to_string();
163    if new_fixed != fixed {
164        fixed = new_fixed;
165        steps.push("Converted raw newlines in strings to \\n".to_string());
166    }
167
168    // 14.5: Generic embedded key fixer – fixes when a second key gets trapped inside a string value
169    let re_embedded_key_start = Regex::new(
170        r#""(?P<key1>\w+)"\s*:\s*"(?P<val>[^"]*?),\s*\\?"(?P<key2>\w+)"\s*:\s*(?P<val2>[^"{}\[\],]+)"#
171    ).unwrap();
172
173    let new_fixed = re_embedded_key_start
174        .replace_all(&fixed, |caps: &Captures| {
175            let key1 = &caps["key1"];
176            let val = &caps["val"];
177            let key2 = &caps["key2"];
178            let val2 = &caps["val2"];
179            format!(r#""{}": "{}", "{}": {}"#, key1, val.trim(), key2, val2)
180        })
181        .to_string();
182
183    if new_fixed != fixed {
184        fixed = new_fixed;
185        steps.push("Fixed embedded key-value pair trapped inside string".to_string());
186    }
187
188    // 14.45. Fix embedded key start inside unescaped value (e.g., "emotion": "hopeful, "score": 80)
189    let re_embedded_key_start = Regex::new(
190        r#""(?P<key1>\w+)"\s*:\s*"(?P<val>[^"]*?),\s*"(?P<key2>\w+)"\s*:\s*(?P<val2>[^"{}\[\],]+)"#,
191    )
192    .unwrap();
193
194    let new_fixed = re_embedded_key_start
195        .replace_all(&fixed, |caps: &Captures| {
196            let key1 = &caps["key1"];
197            let val = &caps["val"];
198            let key2 = &caps["key2"];
199            let val2 = &caps["val2"];
200            format!(
201                r#""{}": "{}", "{}": {}"#,
202                key1,
203                val.trim(),
204                key2,
205                val2.trim()
206            )
207        })
208        .to_string();
209    if new_fixed != fixed {
210        fixed = new_fixed;
211        steps.push("Fixed embedded key start inside unescaped value".to_string());
212    }
213
214    // 14.55. Fix misescaped internal quote sequences (e.g., "text with \"some quote" → "text with some quote")
215    let re_misescaped = Regex::new(r#"(?P<key>:\s*")(?P<val>[^"]*?)\\",\s*(?P<rest>")"#).unwrap();
216    let new_fixed = re_misescaped
217        .replace_all(&fixed, "${key}${val}, ${rest}")
218        .to_string();
219    if new_fixed != fixed {
220        fixed = new_fixed;
221        steps.push("Fixed misescaped internal quote sequences".to_string());
222    }
223
224    // 14.6. Escape unescaped double quotes inside string values (e.g., "... "word" ...")
225    // This step is after step 12 to avoid double escaping.
226    // The regex is designed to avoid matching already escaped quotes.
227    let re_inner_unescaped_quotes =
228        Regex::new(r#"(".*?:\s*")((?:[^"\\]|\\.)*?)"((?:[^"\\]|\\.)*?)""#).unwrap();
229    let new_fixed = re_inner_unescaped_quotes
230        .replace_all(&fixed, "$1$2\\\"$3\"")
231        .to_string();
232    if new_fixed != fixed {
233        fixed = new_fixed;
234        steps.push("Escaped unescaped double quotes inside string values".to_string());
235    }
236
237    // 15. Auto-fix dangling or mismatched brackets/braces using a shallow stack matcher
238    let mut stack = vec![];
239    let mut cleaned = String::new();
240    for c in fixed.chars() {
241        match c {
242            '{' | '[' => {
243                stack.push(c);
244                cleaned.push(c);
245            }
246            '}' => {
247                if stack.last() == Some(&'{') {
248                    stack.pop();
249                    cleaned.push('}');
250                }
251                // else skip unmatched }
252            }
253            ']' => {
254                if stack.last() == Some(&'[') {
255                    stack.pop();
256                    cleaned.push(']');
257                }
258                // else skip unmatched ]
259            }
260            _ => cleaned.push(c),
261        }
262    }
263    // Auto-close any remaining opened delimiters
264    while let Some(c) = stack.pop() {
265        match c {
266            '{' => cleaned.push('}'),
267            '[' => cleaned.push(']'),
268            _ => {}
269        }
270    }
271    if cleaned != fixed {
272        fixed = cleaned;
273        steps.push("Auto-fixed dangling or mismatched brackets/braces".to_string());
274    }
275
276    // Normalize line breaks between quoted array elements (e.g. from GPT)
277    let re_stray_array_linebreaks = Regex::new(r#"(\"\s*),\s*\\n\s*(\")"#).unwrap();
278    let new_fixed = re_stray_array_linebreaks
279        .replace_all(&fixed, "$1, $2")
280        .to_string();
281    if new_fixed != fixed {
282        fixed = new_fixed;
283        steps.push("Normalized line breaks between quoted array elements".to_string());
284    }
285
286    // Also normalize overly escaped array strings with embedded linebreaks
287    let re_array_line_merger = Regex::new(r#"\",\s*\\n\s*\""#).unwrap();
288    let new_fixed = re_array_line_merger
289        .replace_all(&fixed, "\", \"")
290        .to_string();
291    if new_fixed != fixed {
292        fixed = new_fixed;
293        steps.push("Normalized overly escaped array strings with embedded linebreaks".to_string());
294    }
295
296    // Normalize line breaks between array items
297    let re_linebreaks_between_items = Regex::new(r#"\",\s*\n\s*\""#).unwrap();
298    let new_fixed = re_linebreaks_between_items
299        .replace_all(&fixed, "\", \"")
300        .to_string();
301    if new_fixed != fixed {
302        fixed = new_fixed;
303        steps.push("Normalized line breaks between array items".to_string());
304    }
305
306    // 15.9. Remove stray escaped quote-comma sequences (e.g. \"\, → \",)
307    fixed = apply_step(fixed, FixStep::RemoveEscapedQuoteComma, |s| {
308        s.replace("\\\",", "\",")
309    });
310
311    // 16. Final clean-up: attempt full JSON parse and re-serialize if possible
312    // Moved to the end after all mutations, including bracket fixing, for best recovery chance.
313    let success = if let Ok(val) = serde_json::from_str::<serde_json::Value>(&fixed) {
314        if let Ok(re) = serde_json::to_string_pretty(&val) {
315            fixed = re;
316            true
317        } else {
318            false
319        }
320    } else {
321        if let Err(e) = serde_json::from_str::<serde_json::Value>(&fixed) {
322            println!("❌ Final JSON parse error: {}", e);
323            let line = e.line();
324            let column = e.column();
325            println!("📍 Error occurred at line {}, column {}", line, column);
326            println!("📍 Faulty fixed JSON:\n{:#?}", fixed);
327        }
328        false
329    };
330
331    FixReport {
332        original: input.to_string(),
333        fixed,
334        steps,
335        success,
336    }
337}