fuzzy_parser/
sanitize.rs

1//! JSON sanitization for malformed LLM output
2//!
3//! This module provides pre-processing functions to fix common JSON syntax errors
4//! that LLMs often produce, making the JSON parseable before fuzzy repair.
5//!
6//! # Supported Fixes
7//!
8//! - **Trailing commas**: `{"a": 1,}` → `{"a": 1}`
9//! - **Missing closing braces**: `{"a": 1` → `{"a": 1}`
10//! - **Missing closing brackets**: `["a"` → `["a"]`
11//!
12//! # Example
13//!
14//! ```
15//! use fuzzy_parser::sanitize_json;
16//!
17//! // Fix trailing comma
18//! let input = r#"{"name": "test",}"#;
19//! let fixed = sanitize_json(input);
20//! assert_eq!(fixed, r#"{"name": "test"}"#);
21//!
22//! // Fix missing closing brace
23//! let input = r#"{"name": "test""#;
24//! let fixed = sanitize_json(input);
25//! assert_eq!(fixed, r#"{"name": "test"}"#);
26//!
27//! // Combined with fuzzy repair
28//! use fuzzy_parser::{repair_tagged_enum_json, TaggedEnumSchema, FuzzyOptions};
29//!
30//! let schema = TaggedEnumSchema::new("type", &["Action"], |_| Some(&["name"][..]));
31//! let malformed = r#"{"type": "Action", "name": "test",}"#;
32//!
33//! let sanitized = sanitize_json(malformed);
34//! let result = repair_tagged_enum_json(&sanitized, &schema, &FuzzyOptions::default()).unwrap();
35//! assert_eq!(result.repaired["name"], "test");
36//! ```
37//!
38//! # Design Notes
39//!
40//! This function performs **best-effort** sanitization. It handles common cases
41//! but does not attempt to fix all possible JSON errors. For severely malformed
42//! input, the result may still fail to parse.
43//!
44//! The function is designed to be:
45//! - **Safe**: Never produces worse output than input
46//! - **Fast**: Single-pass processing where possible
47//! - **Predictable**: Only fixes well-defined error patterns
48
49/// Sanitize malformed JSON string
50///
51/// Fixes common syntax errors that LLMs produce:
52/// - Trailing commas before `}` or `]`
53/// - Missing closing braces `}` or brackets `]`
54///
55/// # Arguments
56///
57/// * `input` - The potentially malformed JSON string
58///
59/// # Returns
60///
61/// A sanitized JSON string that may be parseable by serde_json.
62///
63/// # Examples
64///
65/// ```
66/// use fuzzy_parser::sanitize_json;
67///
68/// // Trailing comma in object
69/// assert_eq!(sanitize_json(r#"{"a": 1,}"#), r#"{"a": 1}"#);
70///
71/// // Trailing comma in array
72/// assert_eq!(sanitize_json(r#"[1, 2, 3,]"#), r#"[1, 2, 3]"#);
73///
74/// // Missing closing brace
75/// assert_eq!(sanitize_json(r#"{"a": 1"#), r#"{"a": 1}"#);
76///
77/// // Missing closing bracket
78/// assert_eq!(sanitize_json(r#"["a", "b""#), r#"["a", "b"]"#);
79///
80/// // Nested structures
81/// assert_eq!(
82///     sanitize_json(r#"{"items": [1, 2,], "name": "test",}"#),
83///     r#"{"items": [1, 2], "name": "test"}"#
84/// );
85///
86/// // Already valid JSON passes through unchanged
87/// assert_eq!(sanitize_json(r#"{"a": 1}"#), r#"{"a": 1}"#);
88/// ```
89pub fn sanitize_json(input: &str) -> String {
90    let trimmed = input.trim();
91    if trimmed.is_empty() {
92        return String::new();
93    }
94
95    // Step 1: Fix missing closing delimiters first
96    let with_delimiters = fix_missing_delimiters(trimmed);
97
98    // Step 2: Remove trailing commas (now that delimiters exist)
99    remove_trailing_commas(&with_delimiters)
100}
101
/// Drop every comma whose next non-whitespace character is `}` or `]`.
///
/// Commas inside string literals are left alone. A backslash inside a string
/// shields the character that follows it, so an escaped quote (`\"`) does not
/// end the literal.
fn remove_trailing_commas(input: &str) -> String {
    let mut cleaned = String::with_capacity(input.len());
    let mut inside_string = false;
    let mut pending_escape = false;

    for (offset, ch) in input.char_indices() {
        if pending_escape {
            // Character right after a backslash: copied verbatim below.
            pending_escape = false;
        } else if ch == '"' {
            inside_string = !inside_string;
        } else if inside_string {
            pending_escape = ch == '\\';
        } else if ch == ',' {
            // Look past any whitespace to see what the comma precedes.
            let rest = &input[offset + ch.len_utf8()..];
            let precedes_closer =
                matches!(rest.trim_start().chars().next(), Some('}') | Some(']'));
            if precedes_closer {
                continue;
            }
        }
        cleaned.push(ch);
    }

    cleaned
}
152
/// Fix missing closing braces `}` and brackets `]`
///
/// Scans the input once, tracking which `{` / `[` are still open (delimiters
/// inside string literals are ignored), and returns the input with the
/// necessary closers appended, innermost first.
///
/// Additional repairs at the end of the input:
///
/// - A string literal that is still open is terminated with `"` before any
///   closers are appended.
/// - If the input stops right after a backslash inside a string (an escape
///   sequence cut in half), that dangling backslash is dropped first;
///   otherwise it would escape the `"` added to terminate the string and the
///   literal would stay open.
///
/// A closing delimiter that does not match the innermost open one is left in
/// place and otherwise ignored.
fn fix_missing_delimiters(input: &str) -> String {
    let mut in_string = false;
    // Byte offset of a backslash (inside a string) whose escaped character has
    // not been seen yet.
    let mut escape_start: Option<usize> = None;
    // Currently open delimiters, outermost first: each entry is '{' or '['.
    let mut stack: Vec<char> = Vec::new();

    for (idx, c) in input.char_indices() {
        // The character right after a backslash is part of the escape sequence
        // and must not be interpreted as a quote or delimiter.
        if escape_start.take().is_some() {
            continue;
        }

        match c {
            '\\' if in_string => escape_start = Some(idx),
            '"' => in_string = !in_string,
            '{' | '[' if !in_string => stack.push(c),
            '}' | ']' if !in_string => {
                let expected_opener = if c == '}' { '{' } else { '[' };
                if stack.last() == Some(&expected_opener) {
                    stack.pop();
                }
            }
            _ => {}
        }
    }

    // One extra byte for the quote that may be needed to close a string.
    let mut result = String::with_capacity(input.len() + stack.len() + 1);
    match escape_start {
        // Input ended mid-escape: keep everything before the lone backslash.
        // `idx` is the offset of an ASCII char, so it is a valid boundary.
        Some(idx) => result.push_str(&input[..idx]),
        None => result.push_str(input),
    }

    // Close unclosed string if any
    if in_string {
        result.push('"');
    }

    // Append missing closing delimiters in reverse order
    result.extend(
        stack
            .iter()
            .rev()
            .map(|&opener| if opener == '{' { '}' } else { ']' }),
    );

    result
}
217
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that sanitizing `input` produces exactly `expected`.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        assert_eq!(sanitize_json(input), expected);
    }

    /// Asserts that the sanitizer returns `input` untouched.
    #[track_caller]
    fn check_unchanged(input: &str) {
        check(input, input);
    }

    // -------------------------------------------------------------------------
    // Trailing commas
    // -------------------------------------------------------------------------

    #[test]
    fn test_trailing_comma_object() {
        check(r#"{"a": 1,}"#, r#"{"a": 1}"#);
    }

    #[test]
    fn test_trailing_comma_array() {
        check(r#"[1, 2, 3,]"#, r#"[1, 2, 3]"#);
    }

    #[test]
    fn test_trailing_comma_nested_object() {
        check(r#"{"outer": {"inner": 1,},}"#, r#"{"outer": {"inner": 1}}"#);
    }

    #[test]
    fn test_trailing_comma_nested_array() {
        check(r#"[[1, 2,], [3,],]"#, r#"[[1, 2], [3]]"#);
    }

    #[test]
    fn test_trailing_comma_mixed() {
        check(
            r#"{"items": [1, 2,], "name": "test",}"#,
            r#"{"items": [1, 2], "name": "test"}"#,
        );
    }

    #[test]
    fn test_trailing_comma_with_whitespace() {
        // Only the comma is dropped; the whitespace around it stays.
        check(r#"{"a": 1 , }"#, r#"{"a": 1  }"#);
        check("{\n  \"a\": 1,\n}", "{\n  \"a\": 1\n}");
    }

    #[test]
    fn test_comma_in_string_preserved() {
        // A comma that is part of a string value is data, not syntax.
        check_unchanged(r#"{"msg": "hello, world"}"#);
        check_unchanged(r#"{"msg": "a,}"}"#);
    }

    #[test]
    fn test_no_trailing_comma() {
        check_unchanged(r#"{"a": 1}"#);
        check_unchanged(r#"[1, 2, 3]"#);
    }

    // -------------------------------------------------------------------------
    // Missing delimiters
    // -------------------------------------------------------------------------

    #[test]
    fn test_missing_closing_brace() {
        check(r#"{"a": 1"#, r#"{"a": 1}"#);
    }

    #[test]
    fn test_missing_closing_bracket() {
        check(r#"["a", "b""#, r#"["a", "b"]"#);
    }

    #[test]
    fn test_missing_multiple_braces() {
        check(r#"{"a": {"b": 1"#, r#"{"a": {"b": 1}}"#);
    }

    #[test]
    fn test_missing_multiple_brackets() {
        check(r#"[[1, 2], [3"#, r#"[[1, 2], [3]]"#);
    }

    #[test]
    fn test_missing_mixed_delimiters() {
        check(r#"{"items": [1, 2"#, r#"{"items": [1, 2]}"#);
    }

    #[test]
    fn test_brace_in_string_ignored() {
        // A brace inside a string value must not count as an open container.
        check_unchanged(r#"{"msg": "{"}"#);
    }

    #[test]
    fn test_no_missing_delimiters() {
        check_unchanged(r#"{"a": 1}"#);
        check_unchanged(r#"[1, 2]"#);
    }

    // -------------------------------------------------------------------------
    // Both repairs together
    // -------------------------------------------------------------------------

    #[test]
    fn test_trailing_comma_and_missing_brace() {
        check(r#"{"a": 1,"#, r#"{"a": 1}"#);
    }

    #[test]
    fn test_trailing_comma_and_missing_bracket() {
        check(r#"[1, 2,"#, r#"[1, 2]"#);
    }

    #[test]
    fn test_complex_llm_output() {
        let input = r#"{
            "type": "AddDerive",
            "target": "User",
            "derives": ["Debug", "Clone",],
        "#;
        // The closing brace lands right after the last token: the input is
        // trimmed first and the closer is appended without any formatting.
        let expected = r#"{
            "type": "AddDerive",
            "target": "User",
            "derives": ["Debug", "Clone"]}"#;
        check(input, expected);
    }

    // -------------------------------------------------------------------------
    // Edge cases
    // -------------------------------------------------------------------------

    #[test]
    fn test_empty_input() {
        check("", "");
        check("   ", "");
    }

    #[test]
    fn test_whitespace_only() {
        check("  \n\t  ", "");
    }

    #[test]
    fn test_simple_values() {
        for &literal in &["null", "true", "123", r#""string""#] {
            check_unchanged(literal);
        }
    }

    #[test]
    fn test_escaped_quote_in_string() {
        check_unchanged(r#"{"msg": "say \"hello\""}"#);
    }

    #[test]
    fn test_escaped_backslash_in_string() {
        check_unchanged(r#"{"path": "C:\\Users\\test"}"#);
    }

    #[test]
    fn test_unclosed_string() {
        // The open string is terminated, then the object is closed.
        check(r#"{"a": "test"#, r#"{"a": "test"}"#);
    }

    #[test]
    fn test_deeply_nested() {
        check(
            r#"{"a": {"b": {"c": [1, 2,],"#,
            r#"{"a": {"b": {"c": [1, 2]}}}"#,
        );
    }

    // -------------------------------------------------------------------------
    // Inputs shaped like real LLM output
    // -------------------------------------------------------------------------

    #[test]
    fn test_llm_truncated_response() {
        check(
            r#"{"type": "RenameIdent", "from": "old_name", "to": "new_na"#,
            r#"{"type": "RenameIdent", "from": "old_name", "to": "new_na"}"#,
        );
    }

    #[test]
    fn test_llm_array_with_trailing_comma() {
        let input = r#"{"intents": [
            {"type": "AddDerive", "target": "User",},
            {"type": "AddDerive", "target": "Post",},
        ]}"#;
        let fixed = sanitize_json(input);

        // Each entry must now end directly after its last value...
        for &needle in &[r#""target": "User"}"#, r#""target": "Post"}"#] {
            assert!(fixed.contains(needle));
        }
        // ...and no comma may sit immediately before a closer.
        for &forbidden in &[",}", ",]"] {
            assert!(!fixed.contains(forbidden));
        }
    }
}