Skip to main content

rustia/
lenient_json.rs

1use std::borrow::Cow;
2
3use serde_json::{Map, Number, Value};
4
5use crate::{LlmJsonParseError, LlmJsonParseResult};
6
/// Upper bound on how many objects/arrays may be open at once while parsing.
///
/// `LenientJsonParser::parse_value` records an error and yields no value once
/// the parser's current nesting depth reaches this limit.
const MAX_DEPTH: usize = 512;
8
9pub fn parse_lenient_json_value(input: &str) -> LlmJsonParseResult<Value> {
10    if let Ok(data) = serde_json::from_str::<Value>(input) {
11        return LlmJsonParseResult::Success { data };
12    }
13    iterate(input)
14}
15
16fn iterate(input: &str) -> LlmJsonParseResult<Value> {
17    let source = match extract_markdown_code_block(input) {
18        Some(content) => Cow::Owned(content),
19        None => Cow::Borrowed(input),
20    };
21    let json_source = source.as_ref();
22    let trimmed = json_source.trim();
23
24    if trimmed.is_empty() {
25        return LlmJsonParseResult::Failure {
26            data: None,
27            input: input.to_owned(),
28            errors: vec![LlmJsonParseError {
29                path: "$input".to_owned(),
30                expected: "JSON value".to_owned(),
31                description: "empty input".to_owned(),
32            }],
33        };
34    }
35
36    let parse_source = if starts_with_primitive(trimmed) {
37        json_source
38    } else if let Some(json_start) = find_json_start(json_source) {
39        &json_source[json_start..]
40    } else {
41        let skipped = skip_comments_and_whitespace(json_source);
42        if skipped.is_empty() || !starts_with_primitive(skipped) {
43            return LlmJsonParseResult::Failure {
44                data: None,
45                input: input.to_owned(),
46                errors: vec![LlmJsonParseError {
47                    path: "$input".to_owned(),
48                    expected: "JSON value".to_owned(),
49                    description: json_source.to_owned(),
50                }],
51            };
52        }
53        skipped
54    };
55
56    let mut errors = Vec::new();
57    let mut parser = LenientJsonParser::new(parse_source, &mut errors);
58    let data = parser.parse();
59
60    if errors.is_empty() {
61        if let Some(value) = data {
62            LlmJsonParseResult::Success { data: value }
63        } else {
64            LlmJsonParseResult::Failure {
65                data: None,
66                input: input.to_owned(),
67                errors: vec![LlmJsonParseError {
68                    path: "$input".to_owned(),
69                    expected: "JSON value".to_owned(),
70                    description: "unable to parse input".to_owned(),
71                }],
72            }
73        }
74    } else {
75        LlmJsonParseResult::Failure {
76            data,
77            input: input.to_owned(),
78            errors,
79        }
80    }
81}
82
83struct LenientJsonParser<'a> {
84    chars: Vec<char>,
85    pos: usize,
86    depth: usize,
87    errors: &'a mut Vec<LlmJsonParseError>,
88}
89
90impl<'a> LenientJsonParser<'a> {
91    fn new(input: &str, errors: &'a mut Vec<LlmJsonParseError>) -> Self {
92        Self {
93            chars: input.chars().collect(),
94            pos: 0,
95            depth: 0,
96            errors,
97        }
98    }
99
100    fn parse(&mut self) -> Option<Value> {
101        self.skip_whitespace();
102        if self.pos >= self.chars.len() {
103            return None;
104        }
105        self.parse_value("$input")
106    }
107
108    fn parse_value(&mut self, path: &str) -> Option<Value> {
109        self.skip_whitespace();
110
111        if self.pos >= self.chars.len() {
112            return None;
113        }
114
115        if self.depth >= MAX_DEPTH {
116            self.errors.push(LlmJsonParseError {
117                path: path.to_owned(),
118                expected: "value (max depth exceeded)".to_owned(),
119                description: "maximum parser nesting depth exceeded".to_owned(),
120            });
121            return None;
122        }
123
124        match self.current_char() {
125            Some('{') => self.parse_object(path),
126            Some('[') => self.parse_array(path),
127            Some('"') => Some(Value::String(self.parse_string(path))),
128            Some('-') => Some(Value::Number(self.parse_number())),
129            Some(ch) if ch.is_ascii_digit() => Some(Value::Number(self.parse_number())),
130            Some(ch) if is_identifier_start(ch) => self.parse_keyword_or_identifier(path),
131            Some('}') | Some(']') | Some(',') => None,
132            Some(_) => {
133                self.errors.push(LlmJsonParseError {
134                    path: path.to_owned(),
135                    expected: "JSON value (string, number, boolean, null, object, or array)"
136                        .to_owned(),
137                    description: self.get_error_context(),
138                });
139                self.pos += 1;
140                None
141            }
142            None => None,
143        }
144    }
145
146    fn parse_keyword_or_identifier(&mut self, path: &str) -> Option<Value> {
147        let token = self.parse_identifier();
148
149        match token.as_str() {
150            "true" => return Some(Value::Bool(true)),
151            "false" => return Some(Value::Bool(false)),
152            "null" => return Some(Value::Null),
153            _ => {}
154        }
155
156        let lower = token.to_ascii_lowercase();
157        if lower == "yes" || lower == "y" || lower == "on" {
158            return Some(Value::Bool(true));
159        }
160        if lower == "no" || lower == "off" {
161            return Some(Value::Bool(false));
162        }
163
164        if "true".starts_with(token.as_str()) && !token.is_empty() {
165            return Some(Value::Bool(true));
166        }
167        if "false".starts_with(token.as_str()) && !token.is_empty() {
168            return Some(Value::Bool(false));
169        }
170        if "null".starts_with(token.as_str()) && token.len() >= 2 {
171            return Some(Value::Null);
172        }
173
174        if self.current_char() == Some('"') {
175            self.pos += 1;
176            self.errors.push(LlmJsonParseError {
177                path: path.to_owned(),
178                expected: "quoted string".to_owned(),
179                description: format!("missing opening quote for '{token}'"),
180            });
181            return Some(Value::String(token));
182        }
183
184        self.errors.push(LlmJsonParseError {
185            path: path.to_owned(),
186            expected: "JSON value (string, number, boolean, null, object, or array)".to_owned(),
187            description: format!("unquoted string '{token}' - did you forget quotes?"),
188        });
189        self.skip_to_recovery_point();
190        None
191    }
192
193    fn parse_object(&mut self, path: &str) -> Option<Value> {
194        self.pos += 1;
195        self.depth += 1;
196
197        let mut result = Map::new();
198        self.skip_whitespace();
199
200        while self.pos < self.chars.len() {
201            self.skip_whitespace();
202
203            if self.pos >= self.chars.len() {
204                break;
205            }
206
207            match self.current_char() {
208                Some('}') => {
209                    self.pos += 1;
210                    self.depth -= 1;
211                    return Some(Value::Object(result));
212                }
213                Some(',') => {
214                    self.pos += 1;
215                    self.skip_whitespace();
216                    continue;
217                }
218                _ => {}
219            }
220
221            let key = match self.current_char() {
222                Some('"') => self.parse_string(path),
223                Some(ch) if is_identifier_start(ch) => self.parse_identifier(),
224                _ => {
225                    self.errors.push(LlmJsonParseError {
226                        path: path.to_owned(),
227                        expected: "string key".to_owned(),
228                        description: self.get_error_context(),
229                    });
230                    self.depth -= 1;
231                    return Some(Value::Object(result));
232                }
233            };
234
235            self.skip_whitespace();
236            if self.pos >= self.chars.len() {
237                self.depth -= 1;
238                return Some(Value::Object(result));
239            }
240
241            if self.current_char() != Some(':') {
242                self.errors.push(LlmJsonParseError {
243                    path: format!("{path}.{key}"),
244                    expected: "':'".to_owned(),
245                    description: self.get_error_context(),
246                });
247                self.depth -= 1;
248                return Some(Value::Object(result));
249            }
250            self.pos += 1;
251
252            self.skip_whitespace();
253            if self.pos >= self.chars.len() {
254                self.depth -= 1;
255                return Some(Value::Object(result));
256            }
257
258            let value_path = format!("{path}.{key}");
259            let value = self.parse_value(&value_path).unwrap_or(Value::Null);
260            result.insert(key, value);
261
262            self.skip_whitespace();
263            if self.current_char() == Some(',') {
264                self.pos += 1;
265            }
266        }
267
268        self.depth -= 1;
269        Some(Value::Object(result))
270    }
271
272    fn parse_array(&mut self, path: &str) -> Option<Value> {
273        self.pos += 1;
274        self.depth += 1;
275
276        let mut result = Vec::new();
277        let mut index = 0usize;
278
279        self.skip_whitespace();
280
281        while self.pos < self.chars.len() {
282            self.skip_whitespace();
283
284            if self.pos >= self.chars.len() {
285                break;
286            }
287
288            match self.current_char() {
289                Some(']') => {
290                    self.pos += 1;
291                    self.depth -= 1;
292                    return Some(Value::Array(result));
293                }
294                Some(',') => {
295                    self.pos += 1;
296                    self.skip_whitespace();
297                    continue;
298                }
299                _ => {}
300            }
301
302            let previous_pos = self.pos;
303            let item_path = format!("{path}[{index}]");
304            let value = self.parse_value(&item_path).unwrap_or(Value::Null);
305
306            if self.pos == previous_pos && self.pos < self.chars.len() {
307                self.pos += 1;
308                continue;
309            }
310
311            result.push(value);
312            index += 1;
313
314            self.skip_whitespace();
315            if self.current_char() == Some(',') {
316                self.pos += 1;
317            }
318        }
319
320        self.depth -= 1;
321        Some(Value::Array(result))
322    }
323
324    fn parse_string(&mut self, _path: &str) -> String {
325        self.pos += 1;
326        let mut result = String::new();
327        let mut escaped = false;
328
329        while self.pos < self.chars.len() {
330            let current = self.chars[self.pos];
331
332            if escaped {
333                match current {
334                    '"' => result.push('"'),
335                    '\\' => result.push('\\'),
336                    '/' => result.push('/'),
337                    'b' => result.push('\u{0008}'),
338                    'f' => result.push('\u{000C}'),
339                    'n' => result.push('\n'),
340                    'r' => result.push('\r'),
341                    't' => result.push('\t'),
342                    'u' => {
343                        if let Some(high) = self.read_hex4(self.pos + 1) {
344                            self.pos += 4;
345
346                            if (0xd800..=0xdbff).contains(&high)
347                                && self.peek_char(1) == Some('\\')
348                                && self.peek_char(2) == Some('u')
349                                && let Some(low) = self.read_hex4(self.pos + 3)
350                                && (0xdc00..=0xdfff).contains(&low)
351                            {
352                                let high_ten = u32::from(high - 0xd800);
353                                let low_ten = u32::from(low - 0xdc00);
354                                let codepoint = 0x10000 + ((high_ten << 10) | low_ten);
355
356                                if let Some(ch) = char::from_u32(codepoint) {
357                                    result.push(ch);
358                                }
359                                self.pos += 6;
360                                escaped = false;
361                                self.pos += 1;
362                                continue;
363                            }
364
365                            if let Some(ch) = char::from_u32(u32::from(high)) {
366                                result.push(ch);
367                            } else {
368                                result.push_str(&format!("\\u{high:04x}"));
369                            }
370                        } else {
371                            let partial = self.collect_chars(self.pos + 1, 4);
372                            result.push_str("\\u");
373                            result.push_str(&partial);
374                            self.pos += partial.chars().count();
375                        }
376                    }
377                    other => result.push(other),
378                }
379
380                escaped = false;
381                self.pos += 1;
382                continue;
383            }
384
385            if current == '\\' {
386                escaped = true;
387                self.pos += 1;
388                continue;
389            }
390
391            if current == '"' {
392                self.pos += 1;
393                return result;
394            }
395
396            result.push(current);
397            self.pos += 1;
398        }
399
400        result
401    }
402
403    fn parse_number(&mut self) -> Number {
404        let start = self.pos;
405
406        if self.current_char() == Some('-') {
407            self.pos += 1;
408        }
409
410        while matches!(self.current_char(), Some(ch) if ch.is_ascii_digit()) {
411            self.pos += 1;
412        }
413
414        if self.current_char() == Some('.') {
415            self.pos += 1;
416            while matches!(self.current_char(), Some(ch) if ch.is_ascii_digit()) {
417                self.pos += 1;
418            }
419        }
420
421        if matches!(self.current_char(), Some('e') | Some('E')) {
422            self.pos += 1;
423            if matches!(self.current_char(), Some('+') | Some('-')) {
424                self.pos += 1;
425            }
426            while matches!(self.current_char(), Some(ch) if ch.is_ascii_digit()) {
427                self.pos += 1;
428            }
429        }
430
431        let literal: String = self.chars[start..self.pos].iter().collect();
432        number_from_literal(&literal)
433    }
434
435    fn parse_identifier(&mut self) -> String {
436        let start = self.pos;
437        while matches!(self.current_char(), Some(ch) if is_identifier_char(ch)) {
438            self.pos += 1;
439        }
440        self.chars[start..self.pos].iter().collect()
441    }
442
443    fn skip_to_recovery_point(&mut self) {
444        while let Some(ch) = self.current_char() {
445            if matches!(ch, ',' | '}' | ']') {
446                break;
447            }
448            self.pos += 1;
449        }
450    }
451
452    fn skip_whitespace(&mut self) {
453        loop {
454            match self.current_char() {
455                Some(ch) if ch.is_whitespace() => {
456                    self.pos += 1;
457                }
458                Some('/') if self.peek_char(1) == Some('/') => {
459                    self.pos += 2;
460                    while let Some(ch) = self.current_char() {
461                        if matches!(ch, '\n' | '\r') {
462                            break;
463                        }
464                        self.pos += 1;
465                    }
466                }
467                Some('/') if self.peek_char(1) == Some('*') => {
468                    self.pos += 2;
469                    let mut closed = false;
470                    while self.pos + 1 < self.chars.len() {
471                        if self.current_char() == Some('*') && self.peek_char(1) == Some('/') {
472                            self.pos += 2;
473                            closed = true;
474                            break;
475                        }
476                        self.pos += 1;
477                    }
478                    if !closed {
479                        self.pos = self.chars.len();
480                    }
481                }
482                _ => break,
483            }
484        }
485    }
486
487    fn get_error_context(&self) -> String {
488        let start = self.pos.saturating_sub(10);
489        let end = self.pos.saturating_add(20).min(self.chars.len());
490        let before: String = self.chars[start..self.pos].iter().collect();
491        let after: String = self.chars[self.pos..end].iter().collect();
492        let left = if start > 0 { "..." } else { "" };
493        let right = if end < self.chars.len() { "..." } else { "" };
494        format!("{left}{before}→{after}{right}")
495    }
496
497    fn read_hex4(&self, start: usize) -> Option<u16> {
498        if start + 4 > self.chars.len() {
499            return None;
500        }
501
502        let mut value = 0u16;
503        for index in start..start + 4 {
504            let digit = self.chars[index].to_digit(16)? as u16;
505            value = (value << 4) | digit;
506        }
507        Some(value)
508    }
509
510    fn collect_chars(&self, start: usize, count: usize) -> String {
511        self.chars.iter().skip(start).take(count).copied().collect()
512    }
513
514    fn current_char(&self) -> Option<char> {
515        self.chars.get(self.pos).copied()
516    }
517
518    fn peek_char(&self, offset: usize) -> Option<char> {
519        self.chars.get(self.pos + offset).copied()
520    }
521}
522
523fn number_from_literal(literal: &str) -> Number {
524    let is_float = literal
525        .as_bytes()
526        .iter()
527        .any(|byte| matches!(byte, b'.' | b'e' | b'E'));
528
529    if !is_float {
530        if let Ok(value) = literal.parse::<i64>() {
531            return Number::from(value);
532        }
533        if let Ok(value) = literal.parse::<u64>() {
534            return Number::from(value);
535        }
536    }
537
538    if let Ok(value) = literal.parse::<f64>()
539        && let Some(number) = Number::from_f64(value)
540    {
541        return number;
542    }
543
544    Number::from(0)
545}
546
/// Returns `true` if `ch` may begin a bare identifier: an ASCII letter, `_` or `$`.
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_' | '$')
}
550
/// Returns `true` if `ch` may appear inside a bare identifier: an ASCII letter
/// or digit, `_` or `$`.
fn is_identifier_char(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '$')
}
554
/// Extracts the body of the first Markdown ```` ```json ```` fenced block in `input`.
///
/// Returns `None` when there is no such fence, when the input itself already
/// starts (after leading whitespace) with `{`, `[` or `"`, or when no line
/// break follows the opening fence. The body runs from the line after the
/// opening fence up to the next ```` ``` ````, or to the end of the input if
/// the block is never closed.
fn extract_markdown_code_block(input: &str) -> Option<String> {
    const OPENING: &str = "```json";
    const CLOSING: &str = "```";

    let fence_at = input.find(OPENING)?;

    // Input that already looks like JSON is left alone; the fence is then
    // presumably part of a string value rather than a wrapper.
    if matches!(
        input.trim_start().chars().next(),
        Some('{') | Some('[') | Some('"')
    ) {
        return None;
    }

    // Anything else on the fence line (e.g. an info string) is ignored.
    let after_tag = &input[fence_at + OPENING.len()..];
    let line_end = after_tag.find(|ch: char| ch == '\n' || ch == '\r')?;

    // Drop a single line terminator: `\r\n`, `\n`, or a lone `\r`.
    let mut body = &after_tag[line_end..];
    body = body.strip_prefix('\r').unwrap_or(body);
    body = body.strip_prefix('\n').unwrap_or(body);

    let content = match body.find(CLOSING) {
        Some(end) => &body[..end],
        None => body,
    };
    Some(content.to_owned())
}
587
/// Returns the byte offset of the first `{` or `[` in `input` that lies outside
/// `//` comments, `/* */` comments and double-quoted strings.
///
/// Returns `None` if no such character exists. An unterminated block comment
/// or string hides everything after it.
fn find_json_start(input: &str) -> Option<usize> {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut i = 0usize;

    while i < len {
        match (bytes[i], bytes.get(i + 1).copied()) {
            (b'{', _) | (b'[', _) => return Some(i),
            (b'/', Some(b'/')) => {
                // Line comment: stop on the terminator; the generic arm steps over it.
                i += 2;
                while i < len && bytes[i] != b'\n' && bytes[i] != b'\r' {
                    i += 1;
                }
            }
            (b'/', Some(b'*')) => {
                // Block comment: resume just after `*/`, or give up if it never closes.
                let body_start = i + 2;
                i = match bytes[body_start..]
                    .windows(2)
                    .position(|pair| pair[0] == b'*' && pair[1] == b'/')
                {
                    Some(offset) => body_start + offset + 2,
                    None => len,
                };
            }
            (b'"', _) => {
                // Quoted string: a backslash always skips the byte that follows it.
                i += 1;
                while i < len {
                    match bytes[i] {
                        b'\\' => i += 2,
                        b'"' => {
                            i += 1;
                            break;
                        }
                        _ => i += 1,
                    }
                }
            }
            _ => i += 1,
        }
    }

    None
}
645
/// Returns the tail of `input` that follows any leading whitespace, `//` line
/// comments and `/* */` block comments.
///
/// Whitespace is Unicode whitespace, the same definition used by `str::trim`
/// and `char::is_whitespace`, so text that the parser itself would skip (form
/// feed, no-break space, ...) is skipped here as well. An unterminated block
/// comment consumes the rest of the input, yielding an empty string.
fn skip_comments_and_whitespace(input: &str) -> &str {
    let mut rest = input;

    loop {
        let trimmed = rest.trim_start();

        if let Some(comment) = trimmed.strip_prefix("//") {
            // Stop on the line terminator; the next `trim_start` removes it.
            let end = comment
                .find(|ch: char| ch == '\n' || ch == '\r')
                .unwrap_or(comment.len());
            rest = &comment[end..];
        } else if let Some(comment) = trimmed.strip_prefix("/*") {
            rest = match comment.find("*/") {
                Some(end) => &comment[end + 2..],
                None => "",
            };
        } else {
            return trimmed;
        }
    }
}
688
/// Reports whether `input` begins like a JSON primitive the lenient parser accepts.
///
/// That is the case when it starts with `"`, `-` or an ASCII digit; starts with
/// `true`, `false` or `null`; is itself a prefix of `true` or `false`, or a
/// prefix of `null` at least two bytes long; or equals one of `yes`, `y`, `on`,
/// `no`, `off` ignoring ASCII case. Empty input is never a primitive.
fn starts_with_primitive(input: &str) -> bool {
    const KEYWORDS: [&str; 3] = ["true", "false", "null"];
    const BOOLEAN_ALIASES: [&str; 5] = ["yes", "y", "on", "no", "off"];

    let first = match input.chars().next() {
        Some(ch) => ch,
        None => return false,
    };

    if first == '"' || first == '-' || first.is_ascii_digit() {
        return true;
    }

    let complete_keyword = KEYWORDS.iter().any(|&keyword| input.starts_with(keyword));
    let truncated_bool = "true".starts_with(input) || "false".starts_with(input);
    // A lone `n` is too ambiguous to be treated as the start of `null`.
    let truncated_null = input.len() >= 2 && "null".starts_with(input);

    if complete_keyword || truncated_bool || truncated_null {
        return true;
    }

    BOOLEAN_ALIASES
        .iter()
        .any(|&alias| input.eq_ignore_ascii_case(alias))
}