1use std::borrow::Cow;
2
3use serde_json::{Map, Number, Value};
4
5use crate::{LlmJsonParseError, LlmJsonParseResult};
6
/// Nesting limit for the lenient parser: once the parser's `depth` counter
/// reaches this value, `parse_value` records an error and returns `None`
/// instead of descending further.
const MAX_DEPTH: usize = 512;
8
9pub fn parse_lenient_json_value(input: &str) -> LlmJsonParseResult<Value> {
10 if let Ok(data) = serde_json::from_str::<Value>(input) {
11 return LlmJsonParseResult::Success { data };
12 }
13 iterate(input)
14}
15
16fn iterate(input: &str) -> LlmJsonParseResult<Value> {
17 let source = match extract_markdown_code_block(input) {
18 Some(content) => Cow::Owned(content),
19 None => Cow::Borrowed(input),
20 };
21 let json_source = source.as_ref();
22 let trimmed = json_source.trim();
23
24 if trimmed.is_empty() {
25 return LlmJsonParseResult::Failure {
26 data: None,
27 input: input.to_owned(),
28 errors: vec![LlmJsonParseError {
29 path: "$input".to_owned(),
30 expected: "JSON value".to_owned(),
31 description: "empty input".to_owned(),
32 }],
33 };
34 }
35
36 let parse_source = if starts_with_primitive(trimmed) {
37 json_source
38 } else if let Some(json_start) = find_json_start(json_source) {
39 &json_source[json_start..]
40 } else {
41 let skipped = skip_comments_and_whitespace(json_source);
42 if skipped.is_empty() || !starts_with_primitive(skipped) {
43 return LlmJsonParseResult::Failure {
44 data: None,
45 input: input.to_owned(),
46 errors: vec![LlmJsonParseError {
47 path: "$input".to_owned(),
48 expected: "JSON value".to_owned(),
49 description: json_source.to_owned(),
50 }],
51 };
52 }
53 skipped
54 };
55
56 let mut errors = Vec::new();
57 let mut parser = LenientJsonParser::new(parse_source, &mut errors);
58 let data = parser.parse();
59
60 if errors.is_empty() {
61 if let Some(value) = data {
62 LlmJsonParseResult::Success { data: value }
63 } else {
64 LlmJsonParseResult::Failure {
65 data: None,
66 input: input.to_owned(),
67 errors: vec![LlmJsonParseError {
68 path: "$input".to_owned(),
69 expected: "JSON value".to_owned(),
70 description: "unable to parse input".to_owned(),
71 }],
72 }
73 }
74 } else {
75 LlmJsonParseResult::Failure {
76 data,
77 input: input.to_owned(),
78 errors,
79 }
80 }
81}
82
83struct LenientJsonParser<'a> {
84 chars: Vec<char>,
85 pos: usize,
86 depth: usize,
87 errors: &'a mut Vec<LlmJsonParseError>,
88}
89
90impl<'a> LenientJsonParser<'a> {
91 fn new(input: &str, errors: &'a mut Vec<LlmJsonParseError>) -> Self {
92 Self {
93 chars: input.chars().collect(),
94 pos: 0,
95 depth: 0,
96 errors,
97 }
98 }
99
100 fn parse(&mut self) -> Option<Value> {
101 self.skip_whitespace();
102 if self.pos >= self.chars.len() {
103 return None;
104 }
105 self.parse_value("$input")
106 }
107
108 fn parse_value(&mut self, path: &str) -> Option<Value> {
109 self.skip_whitespace();
110
111 if self.pos >= self.chars.len() {
112 return None;
113 }
114
115 if self.depth >= MAX_DEPTH {
116 self.errors.push(LlmJsonParseError {
117 path: path.to_owned(),
118 expected: "value (max depth exceeded)".to_owned(),
119 description: "maximum parser nesting depth exceeded".to_owned(),
120 });
121 return None;
122 }
123
124 match self.current_char() {
125 Some('{') => self.parse_object(path),
126 Some('[') => self.parse_array(path),
127 Some('"') => Some(Value::String(self.parse_string(path))),
128 Some('-') => Some(Value::Number(self.parse_number())),
129 Some(ch) if ch.is_ascii_digit() => Some(Value::Number(self.parse_number())),
130 Some(ch) if is_identifier_start(ch) => self.parse_keyword_or_identifier(path),
131 Some('}') | Some(']') | Some(',') => None,
132 Some(_) => {
133 self.errors.push(LlmJsonParseError {
134 path: path.to_owned(),
135 expected: "JSON value (string, number, boolean, null, object, or array)"
136 .to_owned(),
137 description: self.get_error_context(),
138 });
139 self.pos += 1;
140 None
141 }
142 None => None,
143 }
144 }
145
146 fn parse_keyword_or_identifier(&mut self, path: &str) -> Option<Value> {
147 let token = self.parse_identifier();
148
149 match token.as_str() {
150 "true" => return Some(Value::Bool(true)),
151 "false" => return Some(Value::Bool(false)),
152 "null" => return Some(Value::Null),
153 _ => {}
154 }
155
156 let lower = token.to_ascii_lowercase();
157 if lower == "yes" || lower == "y" || lower == "on" {
158 return Some(Value::Bool(true));
159 }
160 if lower == "no" || lower == "off" {
161 return Some(Value::Bool(false));
162 }
163
164 if "true".starts_with(token.as_str()) && !token.is_empty() {
165 return Some(Value::Bool(true));
166 }
167 if "false".starts_with(token.as_str()) && !token.is_empty() {
168 return Some(Value::Bool(false));
169 }
170 if "null".starts_with(token.as_str()) && token.len() >= 2 {
171 return Some(Value::Null);
172 }
173
174 if self.current_char() == Some('"') {
175 self.pos += 1;
176 self.errors.push(LlmJsonParseError {
177 path: path.to_owned(),
178 expected: "quoted string".to_owned(),
179 description: format!("missing opening quote for '{token}'"),
180 });
181 return Some(Value::String(token));
182 }
183
184 self.errors.push(LlmJsonParseError {
185 path: path.to_owned(),
186 expected: "JSON value (string, number, boolean, null, object, or array)".to_owned(),
187 description: format!("unquoted string '{token}' - did you forget quotes?"),
188 });
189 self.skip_to_recovery_point();
190 None
191 }
192
193 fn parse_object(&mut self, path: &str) -> Option<Value> {
194 self.pos += 1;
195 self.depth += 1;
196
197 let mut result = Map::new();
198 self.skip_whitespace();
199
200 while self.pos < self.chars.len() {
201 self.skip_whitespace();
202
203 if self.pos >= self.chars.len() {
204 break;
205 }
206
207 match self.current_char() {
208 Some('}') => {
209 self.pos += 1;
210 self.depth -= 1;
211 return Some(Value::Object(result));
212 }
213 Some(',') => {
214 self.pos += 1;
215 self.skip_whitespace();
216 continue;
217 }
218 _ => {}
219 }
220
221 let key = match self.current_char() {
222 Some('"') => self.parse_string(path),
223 Some(ch) if is_identifier_start(ch) => self.parse_identifier(),
224 _ => {
225 self.errors.push(LlmJsonParseError {
226 path: path.to_owned(),
227 expected: "string key".to_owned(),
228 description: self.get_error_context(),
229 });
230 self.depth -= 1;
231 return Some(Value::Object(result));
232 }
233 };
234
235 self.skip_whitespace();
236 if self.pos >= self.chars.len() {
237 self.depth -= 1;
238 return Some(Value::Object(result));
239 }
240
241 if self.current_char() != Some(':') {
242 self.errors.push(LlmJsonParseError {
243 path: format!("{path}.{key}"),
244 expected: "':'".to_owned(),
245 description: self.get_error_context(),
246 });
247 self.depth -= 1;
248 return Some(Value::Object(result));
249 }
250 self.pos += 1;
251
252 self.skip_whitespace();
253 if self.pos >= self.chars.len() {
254 self.depth -= 1;
255 return Some(Value::Object(result));
256 }
257
258 let value_path = format!("{path}.{key}");
259 let value = self.parse_value(&value_path).unwrap_or(Value::Null);
260 result.insert(key, value);
261
262 self.skip_whitespace();
263 if self.current_char() == Some(',') {
264 self.pos += 1;
265 }
266 }
267
268 self.depth -= 1;
269 Some(Value::Object(result))
270 }
271
272 fn parse_array(&mut self, path: &str) -> Option<Value> {
273 self.pos += 1;
274 self.depth += 1;
275
276 let mut result = Vec::new();
277 let mut index = 0usize;
278
279 self.skip_whitespace();
280
281 while self.pos < self.chars.len() {
282 self.skip_whitespace();
283
284 if self.pos >= self.chars.len() {
285 break;
286 }
287
288 match self.current_char() {
289 Some(']') => {
290 self.pos += 1;
291 self.depth -= 1;
292 return Some(Value::Array(result));
293 }
294 Some(',') => {
295 self.pos += 1;
296 self.skip_whitespace();
297 continue;
298 }
299 _ => {}
300 }
301
302 let previous_pos = self.pos;
303 let item_path = format!("{path}[{index}]");
304 let value = self.parse_value(&item_path).unwrap_or(Value::Null);
305
306 if self.pos == previous_pos && self.pos < self.chars.len() {
307 self.pos += 1;
308 continue;
309 }
310
311 result.push(value);
312 index += 1;
313
314 self.skip_whitespace();
315 if self.current_char() == Some(',') {
316 self.pos += 1;
317 }
318 }
319
320 self.depth -= 1;
321 Some(Value::Array(result))
322 }
323
324 fn parse_string(&mut self, _path: &str) -> String {
325 self.pos += 1;
326 let mut result = String::new();
327 let mut escaped = false;
328
329 while self.pos < self.chars.len() {
330 let current = self.chars[self.pos];
331
332 if escaped {
333 match current {
334 '"' => result.push('"'),
335 '\\' => result.push('\\'),
336 '/' => result.push('/'),
337 'b' => result.push('\u{0008}'),
338 'f' => result.push('\u{000C}'),
339 'n' => result.push('\n'),
340 'r' => result.push('\r'),
341 't' => result.push('\t'),
342 'u' => {
343 if let Some(high) = self.read_hex4(self.pos + 1) {
344 self.pos += 4;
345
346 if (0xd800..=0xdbff).contains(&high)
347 && self.peek_char(1) == Some('\\')
348 && self.peek_char(2) == Some('u')
349 && let Some(low) = self.read_hex4(self.pos + 3)
350 && (0xdc00..=0xdfff).contains(&low)
351 {
352 let high_ten = u32::from(high - 0xd800);
353 let low_ten = u32::from(low - 0xdc00);
354 let codepoint = 0x10000 + ((high_ten << 10) | low_ten);
355
356 if let Some(ch) = char::from_u32(codepoint) {
357 result.push(ch);
358 }
359 self.pos += 6;
360 escaped = false;
361 self.pos += 1;
362 continue;
363 }
364
365 if let Some(ch) = char::from_u32(u32::from(high)) {
366 result.push(ch);
367 } else {
368 result.push_str(&format!("\\u{high:04x}"));
369 }
370 } else {
371 let partial = self.collect_chars(self.pos + 1, 4);
372 result.push_str("\\u");
373 result.push_str(&partial);
374 self.pos += partial.chars().count();
375 }
376 }
377 other => result.push(other),
378 }
379
380 escaped = false;
381 self.pos += 1;
382 continue;
383 }
384
385 if current == '\\' {
386 escaped = true;
387 self.pos += 1;
388 continue;
389 }
390
391 if current == '"' {
392 self.pos += 1;
393 return result;
394 }
395
396 result.push(current);
397 self.pos += 1;
398 }
399
400 result
401 }
402
403 fn parse_number(&mut self) -> Number {
404 let start = self.pos;
405
406 if self.current_char() == Some('-') {
407 self.pos += 1;
408 }
409
410 while matches!(self.current_char(), Some(ch) if ch.is_ascii_digit()) {
411 self.pos += 1;
412 }
413
414 if self.current_char() == Some('.') {
415 self.pos += 1;
416 while matches!(self.current_char(), Some(ch) if ch.is_ascii_digit()) {
417 self.pos += 1;
418 }
419 }
420
421 if matches!(self.current_char(), Some('e') | Some('E')) {
422 self.pos += 1;
423 if matches!(self.current_char(), Some('+') | Some('-')) {
424 self.pos += 1;
425 }
426 while matches!(self.current_char(), Some(ch) if ch.is_ascii_digit()) {
427 self.pos += 1;
428 }
429 }
430
431 let literal: String = self.chars[start..self.pos].iter().collect();
432 number_from_literal(&literal)
433 }
434
435 fn parse_identifier(&mut self) -> String {
436 let start = self.pos;
437 while matches!(self.current_char(), Some(ch) if is_identifier_char(ch)) {
438 self.pos += 1;
439 }
440 self.chars[start..self.pos].iter().collect()
441 }
442
443 fn skip_to_recovery_point(&mut self) {
444 while let Some(ch) = self.current_char() {
445 if matches!(ch, ',' | '}' | ']') {
446 break;
447 }
448 self.pos += 1;
449 }
450 }
451
452 fn skip_whitespace(&mut self) {
453 loop {
454 match self.current_char() {
455 Some(ch) if ch.is_whitespace() => {
456 self.pos += 1;
457 }
458 Some('/') if self.peek_char(1) == Some('/') => {
459 self.pos += 2;
460 while let Some(ch) = self.current_char() {
461 if matches!(ch, '\n' | '\r') {
462 break;
463 }
464 self.pos += 1;
465 }
466 }
467 Some('/') if self.peek_char(1) == Some('*') => {
468 self.pos += 2;
469 let mut closed = false;
470 while self.pos + 1 < self.chars.len() {
471 if self.current_char() == Some('*') && self.peek_char(1) == Some('/') {
472 self.pos += 2;
473 closed = true;
474 break;
475 }
476 self.pos += 1;
477 }
478 if !closed {
479 self.pos = self.chars.len();
480 }
481 }
482 _ => break,
483 }
484 }
485 }
486
487 fn get_error_context(&self) -> String {
488 let start = self.pos.saturating_sub(10);
489 let end = self.pos.saturating_add(20).min(self.chars.len());
490 let before: String = self.chars[start..self.pos].iter().collect();
491 let after: String = self.chars[self.pos..end].iter().collect();
492 let left = if start > 0 { "..." } else { "" };
493 let right = if end < self.chars.len() { "..." } else { "" };
494 format!("{left}{before}→{after}{right}")
495 }
496
497 fn read_hex4(&self, start: usize) -> Option<u16> {
498 if start + 4 > self.chars.len() {
499 return None;
500 }
501
502 let mut value = 0u16;
503 for index in start..start + 4 {
504 let digit = self.chars[index].to_digit(16)? as u16;
505 value = (value << 4) | digit;
506 }
507 Some(value)
508 }
509
510 fn collect_chars(&self, start: usize, count: usize) -> String {
511 self.chars.iter().skip(start).take(count).copied().collect()
512 }
513
514 fn current_char(&self) -> Option<char> {
515 self.chars.get(self.pos).copied()
516 }
517
518 fn peek_char(&self, offset: usize) -> Option<char> {
519 self.chars.get(self.pos + offset).copied()
520 }
521}
522
523fn number_from_literal(literal: &str) -> Number {
524 let is_float = literal
525 .as_bytes()
526 .iter()
527 .any(|byte| matches!(byte, b'.' | b'e' | b'E'));
528
529 if !is_float {
530 if let Ok(value) = literal.parse::<i64>() {
531 return Number::from(value);
532 }
533 if let Ok(value) = literal.parse::<u64>() {
534 return Number::from(value);
535 }
536 }
537
538 if let Ok(value) = literal.parse::<f64>()
539 && let Some(number) = Number::from_f64(value)
540 {
541 return number;
542 }
543
544 Number::from(0)
545}
546
/// Returns `true` when `ch` may begin a bare identifier: an ASCII letter,
/// `_` or `$`.
fn is_identifier_start(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '_' | '$')
}
550
/// Returns `true` when `ch` may continue a bare identifier: an ASCII letter
/// or digit, `_` or `$`.
fn is_identifier_char(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '$')
}
554
/// Extracts the content of the first ```` ```json ```` fenced block in `input`.
///
/// Returns `None` when:
/// * there is no ```` ```json ```` opener,
/// * the input (after leading whitespace) already starts with `{`, `[` or `"`,
///   in which case the fence is assumed to sit inside JSON data, or
/// * the opener's line is never terminated by a line break.
///
/// The rest of the opener line (e.g. extra info-string text) is discarded
/// along with one line break (`\n`, `\r` or `\r\n`). The content runs up to
/// the next ```` ``` ````, or to the end of input when the block is never
/// closed.
fn extract_markdown_code_block(input: &str) -> Option<String> {
    const OPENER: &str = "```json";
    const FENCE: &str = "```";

    let (_, after_opener) = input.split_once(OPENER)?;

    if matches!(
        input.trim_start().chars().next(),
        Some('{') | Some('[') | Some('"')
    ) {
        return None;
    }

    // Drop whatever else is on the opener line.
    let line_end = after_opener.find(|ch: char| matches!(ch, '\n' | '\r'))?;
    let at_break = &after_opener[line_end..];

    // Consume exactly one line break.
    let body = at_break
        .strip_prefix("\r\n")
        .or_else(|| at_break.strip_prefix('\r'))
        .or_else(|| at_break.strip_prefix('\n'))
        .unwrap_or(at_break);

    let content = match body.find(FENCE) {
        Some(end) => &body[..end],
        None => body,
    };
    Some(content.to_owned())
}
587
/// Returns the byte offset of the first `{` or `[` in `input` that is not
/// inside a `//` comment, a `/* */` comment or a double-quoted string.
///
/// Returns `None` when no such character exists. An unterminated block
/// comment or string swallows the rest of the input.
fn find_json_start(input: &str) -> Option<usize> {
    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut cursor = 0usize;

    while let Some(&byte) = bytes.get(cursor) {
        let next = bytes.get(cursor + 1).copied();

        match (byte, next) {
            (b'{', _) | (b'[', _) => return Some(cursor),
            (b'/', Some(b'/')) => {
                // Line comment: stop on the line break without consuming it.
                let body = cursor + 2;
                cursor = bytes[body..]
                    .iter()
                    .position(|b| matches!(b, b'\n' | b'\r'))
                    .map_or(len, |offset| body + offset);
            }
            (b'/', Some(b'*')) => {
                // Block comment: resume just past the closing `*/`.
                let body = cursor + 2;
                cursor = bytes[body..]
                    .windows(2)
                    .position(|pair| matches!(pair, [b'*', b'/']))
                    .map_or(len, |offset| body + offset + 2);
            }
            (b'"', _) => {
                // Quoted text: a backslash skips the byte that follows it.
                cursor += 1;
                while let Some(&inner) = bytes.get(cursor) {
                    cursor += if inner == b'\\' { 2 } else { 1 };
                    if inner == b'"' {
                        break;
                    }
                }
            }
            _ => cursor += 1,
        }
    }

    None
}
645
/// Returns the tail of `input` after any leading mix of ASCII whitespace
/// (space, tab, `\n`, `\r`), `//` line comments and `/* */` block comments.
///
/// An unterminated block comment consumes everything, yielding an empty string.
fn skip_comments_and_whitespace(input: &str) -> &str {
    let mut rest = input;

    loop {
        let trimmed = rest.trim_start_matches(|ch: char| matches!(ch, ' ' | '\t' | '\n' | '\r'));

        rest = if let Some(comment) = trimmed.strip_prefix("//") {
            // Keep the line break; the next round trims it as whitespace.
            let eol = comment
                .find(|ch: char| matches!(ch, '\n' | '\r'))
                .unwrap_or(comment.len());
            &comment[eol..]
        } else if let Some(comment) = trimmed.strip_prefix("/*") {
            match comment.find("*/") {
                Some(end) => &comment[end + 2..],
                None => &comment[comment.len()..],
            }
        } else {
            return trimmed;
        };
    }
}
688
/// Reports whether `input` looks like it begins with a JSON primitive.
///
/// True when any of the following holds:
/// * the first character is `"`, `-` or an ASCII digit,
/// * the input starts with `true`, `false` or `null`,
/// * the whole input is a non-empty prefix of `true` or `false`, or a prefix
///   of `null` at least two bytes long (a keyword cut off mid-token), or
/// * the whole input is one of `yes`, `y`, `on`, `no`, `off`, compared
///   ASCII case-insensitively.
///
/// Empty input is never a primitive.
fn starts_with_primitive(input: &str) -> bool {
    const KEYWORDS: [&str; 3] = ["true", "false", "null"];
    const BOOLEAN_ALIASES: [&str; 5] = ["yes", "y", "on", "no", "off"];

    let Some(first) = input.chars().next() else {
        return false;
    };

    first == '"'
        || first == '-'
        || first.is_ascii_digit()
        || KEYWORDS.iter().any(|keyword| input.starts_with(keyword))
        || "true".starts_with(input)
        || "false".starts_with(input)
        || (input.len() >= 2 && "null".starts_with(input))
        || BOOLEAN_ALIASES
            .iter()
            .any(|alias| input.eq_ignore_ascii_case(alias))
}