Skip to main content

simple_agents_healing/
parser.rs

1//! JSON-ish parser for handling malformed LLM outputs.
2//!
3//! Implements a three-phase parsing strategy:
4//!
5//! 1. **Strip & Fix**: Remove markdown, fix trailing commas, normalize quotes
6//! 2. **Standard Parse**: Try `serde_json` (fast path)
7//! 3. **Lenient Parse**: Character-by-character state machine for incomplete/malformed JSON
8//!
9//! # Example
10//!
11//! ```
12//! use simple_agents_healing::parser::JsonishParser;
13//!
14//! let parser = JsonishParser::new();
15//!
16//! // Parse markdown-wrapped JSON
17//! let malformed = r#"```json
18//! {"key": "value", "num": 42,}
19//! ```"#;
20//!
21//! let result = parser.parse(malformed).unwrap();
22//! assert_eq!(result.value["key"], "value");
23//! assert_eq!(result.value["num"], 42);
24//! ```
25
26use serde_json::Value;
27use simple_agent_type::coercion::{CoercionFlag, CoercionResult};
28use simple_agent_type::error::{HealingError, Result};
29use std::fmt;
30use tracing::{debug, trace, warn};
31
/// Tunable switches for the JSON-ish parser.
///
/// Every `bool` turns one repair step on or off; `min_confidence` is the
/// lowest confidence a result may carry before it is rejected.
#[derive(Debug, Clone)]
pub struct ParserConfig {
    /// Strip markdown code fences (```json ... ```) around the payload
    pub strip_markdown: bool,
    /// Remove trailing commas before a closing brace or bracket
    pub fix_trailing_commas: bool,
    /// Normalize quotes (single → double)
    pub fix_quotes: bool,
    /// Quote bare object keys ({key: "value"} → {"key": "value"})
    pub fix_unquoted_keys: bool,
    /// Drop control characters that are not whitespace
    pub fix_control_chars: bool,
    /// Remove a leading byte-order mark
    pub remove_bom: bool,
    /// Fall back to the lenient state machine when standard parsing fails
    pub allow_lenient_parsing: bool,
    /// Minimum confidence threshold (0.0-1.0)
    pub min_confidence: f32,
}

impl Default for ParserConfig {
    /// All repair steps enabled, lenient parsing allowed, threshold `0.5`.
    fn default() -> Self {
        ParserConfig {
            min_confidence: 0.5,
            allow_lenient_parsing: true,
            remove_bom: true,
            fix_control_chars: true,
            fix_unquoted_keys: true,
            fix_quotes: true,
            fix_trailing_commas: true,
            strip_markdown: true,
        }
    }
}
67
/// Result of parsing with coercion tracking.
///
/// Alias for [`CoercionResult`] holding a [`serde_json::Value`]; the parser
/// fills in its `value`, `flags`, and `confidence` fields.
pub type ParserResult = CoercionResult<Value>;
70
/// Three-phase JSON parser for malformed LLM outputs.
///
/// # Phases
///
/// 1. **Strip & Fix**: Quick string transformations
/// 2. **Standard Parse**: Attempt `serde_json::from_str` (fast path)
/// 3. **Lenient Parse**: State machine for incomplete/malformed JSON
///
/// Which fixes run, whether the lenient phase is allowed, and the minimum
/// accepted confidence are all taken from the parser's [`ParserConfig`].
///
/// # Example
///
/// ```
/// use simple_agents_healing::parser::JsonishParser;
///
/// let parser = JsonishParser::new();
/// let result = parser.parse(r#"{"key": "value",}"#).unwrap();
/// assert!(result.flags.iter().any(|f| matches!(f,
///     simple_agent_type::coercion::CoercionFlag::FixedTrailingComma)));
/// ```
pub struct JsonishParser {
    /// Settings controlling the repair steps, lenient parsing, and the
    /// confidence threshold.
    config: ParserConfig,
}
92
93impl JsonishParser {
94    /// Create a new parser with default configuration.
95    pub fn new() -> Self {
96        Self {
97            config: ParserConfig::default(),
98        }
99    }
100
101    /// Create a parser with custom configuration.
102    pub fn with_config(config: ParserConfig) -> Self {
103        Self { config }
104    }
105
106    /// Parse potentially malformed JSON.
107    ///
108    /// Returns a [`ParserResult`] containing the parsed value, flags indicating
109    /// transformations applied, and a confidence score.
110    ///
111    /// # Errors
112    ///
113    /// Returns [`HealingError::ParseFailed`] if all parsing phases fail.
114    /// Returns [`HealingError::LowConfidence`] if confidence is below threshold.
115    ///
116    /// # Example
117    ///
118    /// ```
119    /// use simple_agents_healing::parser::JsonishParser;
120    ///
121    /// let parser = JsonishParser::new();
122    ///
123    /// // Perfect JSON - no healing needed
124    /// let result = parser.parse(r#"{"key": "value"}"#).unwrap();
125    /// assert_eq!(result.confidence, 1.0);
126    /// assert!(result.flags.is_empty());
127    ///
128    /// // Malformed JSON - healing applied
129    /// let result = parser.parse(r#"{"key": "value",}"#).unwrap();
130    /// assert!(result.confidence < 1.0);
131    /// assert!(!result.flags.is_empty());
132    /// ```
133    pub fn parse(&self, input: &str) -> Result<ParserResult> {
134        trace!("Starting JSON parse: {} bytes", input.len());
135
136        let mut flags = Vec::new();
137        let mut confidence = 1.0;
138
139        // Phase 1: Strip & Fix
140        let cleaned = self.strip_and_fix(input, &mut flags, &mut confidence)?;
141
142        // Phase 2: Try standard parsing
143        if let Ok(value) = serde_json::from_str::<Value>(&cleaned) {
144            debug!("Standard parse succeeded with {} flags", flags.len());
145            let result = CoercionResult {
146                value,
147                flags,
148                confidence,
149            };
150
151            return self.check_confidence(result);
152        }
153
154        // Phase 3: Lenient parsing (if enabled)
155        if self.config.allow_lenient_parsing {
156            warn!("Standard parse failed, attempting lenient parse");
157            let value = self.lenient_parse(&cleaned, &mut flags, &mut confidence)?;
158
159            let result = CoercionResult {
160                value,
161                flags,
162                confidence,
163            };
164
165            return self.check_confidence(result);
166        }
167
168        // All parsing failed
169        Err(HealingError::ParseFailed {
170            error_message: "Could not parse JSON".to_string(),
171            input: input.to_string(),
172        }
173        .into())
174    }
175
176    /// Phase 1: Strip markdown and fix common issues.
177    fn strip_and_fix(
178        &self,
179        input: &str,
180        flags: &mut Vec<CoercionFlag>,
181        confidence: &mut f32,
182    ) -> Result<String> {
183        let mut output = input.to_string();
184
185        // Remove BOM
186        if self.config.remove_bom && output.starts_with('\u{FEFF}') {
187            output = output.trim_start_matches('\u{FEFF}').to_string();
188            flags.push(CoercionFlag::RemovedBom);
189            *confidence *= 0.99; // Minimal impact
190        }
191
192        // Strip markdown code blocks
193        if self.config.strip_markdown {
194            if let Some(stripped) = self.strip_markdown(&output) {
195                output = stripped;
196                flags.push(CoercionFlag::StrippedMarkdown);
197                *confidence *= 0.95;
198            }
199        }
200
201        // Fix trailing commas
202        if self.config.fix_trailing_commas && (output.contains(",}") || output.contains(",]")) {
203            output = output.replace(",}", "}").replace(",]", "]");
204            flags.push(CoercionFlag::FixedTrailingComma);
205            *confidence *= 0.95;
206        }
207
208        // Fix single quotes (only if no double quotes present)
209        if self.config.fix_quotes && output.contains('\'') && !output.contains('"') {
210            output = output.replace('\'', "\"");
211            flags.push(CoercionFlag::FixedQuotes);
212            *confidence *= 0.90;
213        }
214
215        // Fix control characters
216        if self.config.fix_control_chars {
217            let original_len = output.len();
218            output = output
219                .chars()
220                .filter(|c| !c.is_control() || c.is_whitespace())
221                .collect();
222            if output.len() != original_len {
223                flags.push(CoercionFlag::FixedControlCharacters);
224                *confidence *= 0.90;
225            }
226        }
227
228        // Fix unquoted keys (basic)
229        if self.config.fix_unquoted_keys {
230            // This is a simplified version - full implementation would need a parser
231            if let Some(fixed) = self.fix_unquoted_keys_simple(&output) {
232                output = fixed;
233                flags.push(CoercionFlag::FixedUnquotedKeys);
234                *confidence *= 0.85;
235            }
236        }
237
238        Ok(output)
239    }
240
241    /// Strip markdown code fences.
242    fn strip_markdown(&self, input: &str) -> Option<String> {
243        let trimmed = input.trim();
244
245        // Check for ```json ... ``` or ``` ... ```
246        if trimmed.starts_with("```") {
247            let lines: Vec<&str> = trimmed.lines().collect();
248            if lines.len() >= 2 {
249                // Remove first line (```json or ```) and last line (```)
250                let start = 1; // Always skip first line (```json or ```)
251
252                let end = if lines.last().map(|l| l.trim()) == Some("```") {
253                    lines.len() - 1
254                } else {
255                    lines.len()
256                };
257
258                if end > start {
259                    let content = lines[start..end].join("\n");
260                    return Some(content);
261                }
262            }
263        }
264
265        None
266    }
267
268    /// Simple unquoted key fixer (handles common cases).
269    fn fix_unquoted_keys_simple(&self, _input: &str) -> Option<String> {
270        // This is a simplified implementation
271        // Full version would need proper parsing
272        // For now, we skip this fix if regex feature is not enabled
273        #[cfg(feature = "regex-support")]
274        {
275            let pattern = regex::Regex::new(r"([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)\s*:").ok()?;
276
277            if pattern.is_match(_input) {
278                let result = pattern.replace_all(_input, r#"$1"$2":"#).to_string();
279                return Some(result);
280            }
281        }
282
283        None
284    }
285}
286
/// State machine for lenient JSON parsing.
///
/// Tracks parsing state and handles incomplete/malformed JSON structures.
/// Input is fed one character at a time through `process_char`; `finalize`
/// then closes anything left open and yields the parsed value.
#[derive(Debug)]
struct LenientParserState {
    /// Stack of nested collections (objects/arrays); the innermost open
    /// collection is the last element
    stack: Vec<CollectionState>,
    /// Current state
    state: ParseState,
    /// Completed top-level values (there may be several; `finalize` returns
    /// the first)
    completed: Vec<Value>,
    /// Flags to track transformations
    flags: Vec<CoercionFlag>,
    /// Current string being built (quoted strings, unquoted keys, and bare
    /// literals all accumulate here)
    current_string: String,
    /// Current number being built
    current_number: String,
    /// Current key in object
    current_key: Option<String>,
    /// Whether we're escaping the next character (set after a backslash
    /// inside a string)
    is_escaped: bool,
}
309
/// Parser state for the state machine.
///
/// Exactly one state is active at a time; `process_char` dispatches each
/// input character to the handler for the active state.
#[derive(Debug, Clone, PartialEq)]
enum ParseState {
    /// Expecting a value (initial state; also entered after `:`, after `,`
    /// in an array, after a top-level value, and when a comment ends)
    ExpectValue,
    /// Inside a string (with delimiter type)
    InString(StringDelimiter),
    /// Inside a number
    InNumber,
    /// Expecting object key or }
    ExpectKey,
    /// Expecting : after key
    ExpectColon,
    /// After value in object, expecting , or }
    AfterValue,
    /// After value in array, expecting , or ]
    AfterArrayValue,
    /// Inside a line comment (//), which runs to the next newline
    InLineComment,
    /// Inside a block comment (/* */)
    InBlockComment,
}
332
/// String delimiter types.
///
/// Records how the string currently being read was opened, which determines
/// what closes it.
#[derive(Debug, Clone, Copy, PartialEq)]
enum StringDelimiter {
    /// Double quotes: "..."
    Double,
    /// Single quotes: '...'
    Single,
    /// Triple double quotes: """..."""
    TripleDouble,
    /// Triple single quotes: '''...'''
    TripleSingle,
    /// Backtick: `...`
    Backtick,
    /// Unquoted (bare object keys and `true`/`false`/`null` literals): key.
    /// Ends at whitespace or one of `,` `:` `}` `]`.
    Unquoted,
}
349
/// Collection being built (object or array).
///
/// One entry is pushed on the parser's stack for every `{` or `[` that has
/// been opened and not yet closed.
#[derive(Debug)]
enum CollectionState {
    /// Object: keys and values. `keys[i]` pairs with `values[i]`; the two
    /// vectors are zipped into a map when the object is closed.
    Object {
        keys: Vec<String>,
        values: Vec<Value>,
    },
    /// Array: values only, in input order
    Array { values: Vec<Value> },
}
361
362impl LenientParserState {
363    fn new() -> Self {
364        Self {
365            stack: Vec::new(),
366            state: ParseState::ExpectValue,
367            completed: Vec::new(),
368            flags: Vec::new(),
369            current_string: String::new(),
370            current_number: String::new(),
371            current_key: None,
372            is_escaped: false,
373        }
374    }
375
376    /// Process a single character.
377    ///
378    /// Returns the number of characters to advance (usually 1, but may be more for multi-char tokens).
379    fn process_char(&mut self, ch: char, next: Option<char>, next2: Option<char>) -> Result<usize> {
380        // Handle escape sequences in strings
381        if self.is_escaped {
382            self.current_string.push(ch);
383            self.is_escaped = false;
384            return Ok(1);
385        }
386
387        match &self.state {
388            ParseState::ExpectValue => self.handle_expect_value(ch, next, next2),
389            ParseState::InString(delim) => self.handle_in_string(ch, next, next2, *delim),
390            ParseState::InNumber => self.handle_in_number(ch),
391            ParseState::ExpectKey => self.handle_expect_key(ch, next, next2),
392            ParseState::ExpectColon => self.handle_expect_colon(ch),
393            ParseState::AfterValue => self.handle_after_value(ch, next),
394            ParseState::AfterArrayValue => self.handle_after_array_value(ch, next),
395            ParseState::InLineComment => self.handle_line_comment(ch),
396            ParseState::InBlockComment => self.handle_block_comment(ch, next),
397        }
398    }
399
400    fn handle_expect_value(
401        &mut self,
402        ch: char,
403        next: Option<char>,
404        next2: Option<char>,
405    ) -> Result<usize> {
406        match ch {
407            // Whitespace - skip
408            ' ' | '\t' | '\n' | '\r' => Ok(1),
409
410            // Start object
411            '{' => {
412                self.stack.push(CollectionState::Object {
413                    keys: Vec::new(),
414                    values: Vec::new(),
415                });
416                self.state = ParseState::ExpectKey;
417                Ok(1)
418            }
419
420            // Start array
421            '[' => {
422                self.stack
423                    .push(CollectionState::Array { values: Vec::new() });
424                Ok(1)
425            }
426
427            // Triple quote strings
428            '"' if next == Some('"') && next2 == Some('"') => {
429                self.state = ParseState::InString(StringDelimiter::TripleDouble);
430                self.current_string.clear();
431                Ok(3)
432            }
433            '\'' if next == Some('\'') && next2 == Some('\'') => {
434                self.state = ParseState::InString(StringDelimiter::TripleSingle);
435                self.current_string.clear();
436                Ok(3)
437            }
438
439            // Regular strings
440            '"' => {
441                self.state = ParseState::InString(StringDelimiter::Double);
442                self.current_string.clear();
443                Ok(1)
444            }
445            '\'' => {
446                self.state = ParseState::InString(StringDelimiter::Single);
447                self.current_string.clear();
448                Ok(1)
449            }
450            '`' => {
451                self.state = ParseState::InString(StringDelimiter::Backtick);
452                self.current_string.clear();
453                Ok(1)
454            }
455
456            // Comments
457            '/' if next == Some('/') => {
458                self.state = ParseState::InLineComment;
459                Ok(2)
460            }
461            '/' if next == Some('*') => {
462                self.state = ParseState::InBlockComment;
463                Ok(2)
464            }
465
466            // Numbers
467            '-' | '0'..='9' => {
468                self.state = ParseState::InNumber;
469                self.current_number.clear();
470                self.current_number.push(ch);
471                Ok(1)
472            }
473
474            // Boolean/null literals
475            't' | 'f' | 'n' => self.handle_literal(ch),
476
477            // Unexpected character
478            _ => {
479                trace!(
480                    "Skipping unexpected character '{}' when expecting value",
481                    ch
482                );
483                Ok(1) // Skip it
484            }
485        }
486    }
487
488    fn handle_in_string(
489        &mut self,
490        ch: char,
491        next: Option<char>,
492        next2: Option<char>,
493        delim: StringDelimiter,
494    ) -> Result<usize> {
495        match (ch, delim) {
496            // Escape sequence
497            ('\\', _) => {
498                self.is_escaped = true;
499                self.current_string.push(ch);
500                Ok(1)
501            }
502
503            // End of triple quote string
504            ('"', StringDelimiter::TripleDouble) if next == Some('"') && next2 == Some('"') => {
505                self.finish_string();
506                Ok(3)
507            }
508            ('\'', StringDelimiter::TripleSingle) if next == Some('\'') && next2 == Some('\'') => {
509                self.finish_string();
510                Ok(3)
511            }
512
513            // End of regular string
514            ('"', StringDelimiter::Double)
515            | ('\'', StringDelimiter::Single)
516            | ('`', StringDelimiter::Backtick) => {
517                self.finish_string();
518                Ok(1)
519            }
520
521            // End unquoted string (whitespace, comma, colon, brace, bracket)
522            (c, StringDelimiter::Unquoted)
523                if c.is_whitespace() || matches!(c, ',' | ':' | '}' | ']') =>
524            {
525                self.finish_string();
526                Ok(0) // Don't consume this character
527            }
528
529            // Regular character in string
530            _ => {
531                self.current_string.push(ch);
532                Ok(1)
533            }
534        }
535    }
536
537    fn finish_string(&mut self) {
538        // Check if this is a boolean or null literal (from unquoted strings)
539        let value = match self.current_string.as_str() {
540            "true" => Value::Bool(true),
541            "false" => Value::Bool(false),
542            "null" => Value::Null,
543            s => Value::String(s.to_string()),
544        };
545
546        self.push_value(value);
547        self.current_string.clear();
548    }
549
550    fn handle_in_number(&mut self, ch: char) -> Result<usize> {
551        match ch {
552            // Valid number characters
553            '0'..='9' | '.' | 'e' | 'E' | '+' | '-' => {
554                self.current_number.push(ch);
555                Ok(1)
556            }
557
558            // End of number
559            _ => {
560                self.finish_number()?;
561                Ok(0) // Don't consume this character
562            }
563        }
564    }
565
566    fn finish_number(&mut self) -> Result<()> {
567        let num_str = self.current_number.trim();
568
569        // Try parsing as integer first
570        if let Ok(i) = num_str.parse::<i64>() {
571            self.push_value(Value::Number(serde_json::Number::from(i)));
572        }
573        // Try as float
574        else if let Ok(f) = num_str.parse::<f64>() {
575            if let Some(num) = serde_json::Number::from_f64(f) {
576                self.push_value(Value::Number(num));
577            } else {
578                // Invalid float
579                self.push_value(Value::String(num_str.to_string()));
580            }
581        }
582        // Not a valid number - treat as string
583        else {
584            self.push_value(Value::String(num_str.to_string()));
585        }
586
587        self.current_number.clear();
588        Ok(())
589    }
590
591    fn handle_literal(&mut self, ch: char) -> Result<usize> {
592        // Only accept true, false, null as literals
593        // Don't accept arbitrary unquoted strings at top level
594        match ch {
595            't' => {
596                self.state = ParseState::InString(StringDelimiter::Unquoted);
597                self.current_string.clear();
598                self.current_string.push(ch);
599                Ok(1)
600            }
601            'f' => {
602                self.state = ParseState::InString(StringDelimiter::Unquoted);
603                self.current_string.clear();
604                self.current_string.push(ch);
605                Ok(1)
606            }
607            'n' => {
608                self.state = ParseState::InString(StringDelimiter::Unquoted);
609                self.current_string.clear();
610                self.current_string.push(ch);
611                Ok(1)
612            }
613            _ => Ok(1), // Skip unknown characters
614        }
615    }
616
617    fn handle_expect_key(
618        &mut self,
619        ch: char,
620        next: Option<char>,
621        _next2: Option<char>,
622    ) -> Result<usize> {
623        match ch {
624            // Whitespace - skip
625            ' ' | '\t' | '\n' | '\r' => Ok(1),
626
627            // Empty object
628            '}' => {
629                self.finish_collection()?;
630                Ok(1)
631            }
632
633            // Comments
634            '/' if next == Some('/') => {
635                self.state = ParseState::InLineComment;
636                Ok(2)
637            }
638            '/' if next == Some('*') => {
639                self.state = ParseState::InBlockComment;
640                Ok(2)
641            }
642
643            // Quoted key
644            '"' => {
645                self.state = ParseState::InString(StringDelimiter::Double);
646                self.current_string.clear();
647                Ok(1)
648            }
649            '\'' => {
650                self.state = ParseState::InString(StringDelimiter::Single);
651                self.current_string.clear();
652                Ok(1)
653            }
654
655            // Unquoted key
656            'a'..='z' | 'A'..='Z' | '_' => {
657                self.flags.push(CoercionFlag::FixedUnquotedKeys);
658                self.state = ParseState::InString(StringDelimiter::Unquoted);
659                self.current_string.clear();
660                self.current_string.push(ch);
661                Ok(1)
662            }
663
664            _ => {
665                warn!("Unexpected character '{}' when expecting object key", ch);
666                Ok(1)
667            }
668        }
669    }
670
671    fn handle_expect_colon(&mut self, ch: char) -> Result<usize> {
672        match ch {
673            ' ' | '\t' | '\n' | '\r' => Ok(1),
674            ':' => {
675                self.state = ParseState::ExpectValue;
676                Ok(1)
677            }
678            _ => {
679                warn!("Expected ':' but got '{}'", ch);
680                // Be lenient - assume missing colon
681                self.state = ParseState::ExpectValue;
682                Ok(0)
683            }
684        }
685    }
686
687    fn handle_after_value(&mut self, ch: char, next: Option<char>) -> Result<usize> {
688        match ch {
689            ' ' | '\t' | '\n' | '\r' => Ok(1),
690            ',' => {
691                self.state = ParseState::ExpectKey;
692                Ok(1)
693            }
694            '}' => {
695                self.finish_collection()?;
696                Ok(1)
697            }
698            '/' if next == Some('/') => {
699                self.state = ParseState::InLineComment;
700                Ok(2)
701            }
702            '/' if next == Some('*') => {
703                self.state = ParseState::InBlockComment;
704                Ok(2)
705            }
706            _ => {
707                warn!("Expected ',' or '}}' but got '{}'", ch);
708                Ok(1)
709            }
710        }
711    }
712
713    fn handle_after_array_value(&mut self, ch: char, next: Option<char>) -> Result<usize> {
714        match ch {
715            ' ' | '\t' | '\n' | '\r' => Ok(1),
716            ',' => {
717                self.state = ParseState::ExpectValue;
718                Ok(1)
719            }
720            ']' => {
721                self.finish_collection()?;
722                Ok(1)
723            }
724            '/' if next == Some('/') => {
725                self.state = ParseState::InLineComment;
726                Ok(2)
727            }
728            '/' if next == Some('*') => {
729                self.state = ParseState::InBlockComment;
730                Ok(2)
731            }
732            _ => {
733                warn!("Expected ',' or ']' but got '{}'", ch);
734                Ok(1)
735            }
736        }
737    }
738
739    fn handle_line_comment(&mut self, ch: char) -> Result<usize> {
740        if ch == '\n' {
741            // End of line comment - return to previous state
742            self.state = ParseState::ExpectValue;
743        }
744        Ok(1)
745    }
746
747    fn handle_block_comment(&mut self, ch: char, next: Option<char>) -> Result<usize> {
748        if ch == '*' && next == Some('/') {
749            // End of block comment
750            self.state = ParseState::ExpectValue;
751            Ok(2)
752        } else {
753            Ok(1)
754        }
755    }
756
757    fn push_value(&mut self, value: Value) {
758        match self.stack.last_mut() {
759            Some(CollectionState::Object { keys, values }) => {
760                if let Some(key) = self.current_key.take() {
761                    // This is a value in an object
762                    keys.push(key);
763                    values.push(value);
764                    self.state = ParseState::AfterValue;
765                } else {
766                    // This is a key
767                    if let Value::String(s) = value {
768                        self.current_key = Some(s);
769                        self.state = ParseState::ExpectColon;
770                    }
771                }
772            }
773            Some(CollectionState::Array { values }) => {
774                values.push(value);
775                self.state = ParseState::AfterArrayValue;
776            }
777            None => {
778                // Top-level value - only accept valid JSON types
779                // Reject arbitrary unquoted strings (they should only appear as object keys)
780                match &value {
781                    Value::Object(_)
782                    | Value::Array(_)
783                    | Value::Number(_)
784                    | Value::Bool(_)
785                    | Value::Null => {
786                        self.completed.push(value);
787                        self.state = ParseState::ExpectValue;
788                    }
789                    Value::String(_) => {
790                        // Only accept strings that were properly quoted
791                        // (The state machine should track this, but for now we accept all strings)
792                        self.completed.push(value);
793                        self.state = ParseState::ExpectValue;
794                    }
795                }
796            }
797        }
798    }
799
800    fn finish_collection(&mut self) -> Result<()> {
801        if let Some(collection) = self.stack.pop() {
802            let value = match collection {
803                CollectionState::Object { keys, values } => {
804                    let mut map = serde_json::Map::new();
805                    for (key, value) in keys.into_iter().zip(values.into_iter()) {
806                        map.insert(key, value);
807                    }
808                    Value::Object(map)
809                }
810                CollectionState::Array { values } => Value::Array(values),
811            };
812
813            self.push_value(value);
814        }
815
816        Ok(())
817    }
818
819    /// Finalize parsing and auto-complete any unclosed structures.
820    fn finalize(mut self) -> Result<(Value, Vec<CoercionFlag>)> {
821        // Finish any incomplete string
822        if !self.current_string.is_empty() {
823            self.flags.push(CoercionFlag::TruncatedJson);
824            self.finish_string();
825        }
826
827        // Finish any incomplete number
828        if !self.current_number.is_empty() {
829            self.flags.push(CoercionFlag::TruncatedJson);
830            self.finish_number()?;
831        }
832
833        // Close all unclosed collections
834        while !self.stack.is_empty() {
835            self.flags.push(CoercionFlag::TruncatedJson);
836            self.finish_collection()?;
837        }
838
839        // Validate that we have actual JSON, not just random text
840        if self.completed.is_empty() {
841            return Err(HealingError::ParseFailed {
842                error_message: "No valid JSON found".to_string(),
843                input: String::new(),
844            }
845            .into());
846        }
847
848        // Check if we only have plain strings that aren't valid JSON keywords
849        // This catches cases like "this is not json at all"
850        let only_invalid_strings = self.completed.iter().all(|v| {
851            matches!(v, Value::String(s) if s != "true" && s != "false" && s != "null" && !s.is_empty())
852        });
853
854        if only_invalid_strings && self.completed.len() > 1 {
855            // Multiple plain text strings - not valid JSON
856            return Err(HealingError::ParseFailed {
857                error_message: "Input appears to be plain text, not JSON".to_string(),
858                input: String::new(),
859            }
860            .into());
861        }
862
863        // Return the result
864        match self.completed.len() {
865            1 => Ok((self.completed.into_iter().next().unwrap(), self.flags)),
866            _ => {
867                // Multiple values - return only the first one (most common case)
868                // This handles cases like {"obj1": 1} {"obj2": 2} where LLMs
869                // accidentally generate multiple objects
870                self.flags.push(CoercionFlag::TruncatedJson);
871                Ok((self.completed.into_iter().next().unwrap(), self.flags))
872            }
873        }
874    }
875}
876
877impl JsonishParser {
878    /// Phase 3: Lenient state machine parser.
879    ///
880    /// Implements a character-by-character state machine that handles:
881    /// - Incomplete JSON (unclosed strings, objects, arrays)
882    /// - Unquoted keys
883    /// - Multiple string delimiter types (", ', """, `, etc.)
884    /// - Comments (// and /* */)
885    /// - Auto-completion of partial structures
886    fn lenient_parse(
887        &self,
888        input: &str,
889        flags: &mut Vec<CoercionFlag>,
890        confidence: &mut f32,
891    ) -> Result<Value> {
892        let mut state = LenientParserState::new();
893        let chars: Vec<char> = input.chars().collect();
894        let mut i = 0;
895
896        trace!("Starting lenient parse of {} characters", chars.len());
897
898        while i < chars.len() {
899            let ch = chars[i];
900
901            // Peek ahead for lookahead decisions
902            let next = if i + 1 < chars.len() {
903                Some(chars[i + 1])
904            } else {
905                None
906            };
907            let next2 = if i + 2 < chars.len() {
908                Some(chars[i + 2])
909            } else {
910                None
911            };
912
913            match state.process_char(ch, next, next2) {
914                Ok(advance) => {
915                    i += advance;
916                }
917                Err(e) => {
918                    debug!("Parser error at position {}: {:?}", i, e);
919                    // Continue trying to parse
920                    i += 1;
921                }
922            }
923        }
924
925        // Auto-complete any unclosed structures
926        let (value, parse_flags) = state.finalize()?;
927
928        // Merge flags and update confidence
929        for flag in parse_flags {
930            if !flags.contains(&flag) {
931                flags.push(flag.clone());
932
933                // Update confidence based on flag severity
934                *confidence *= match flag {
935                    CoercionFlag::TruncatedJson => 0.60,
936                    CoercionFlag::FixedUnquotedKeys => 0.85,
937                    _ => 0.90,
938                };
939            }
940        }
941
942        Ok(value)
943    }
944
945    /// Check if confidence meets threshold.
946    fn check_confidence(&self, result: ParserResult) -> Result<ParserResult> {
947        if result.confidence < self.config.min_confidence {
948            return Err(HealingError::LowConfidence {
949                confidence: result.confidence,
950                threshold: self.config.min_confidence,
951            }
952            .into());
953        }
954        Ok(result)
955    }
956}
957
958impl Default for JsonishParser {
959    fn default() -> Self {
960        Self::new()
961    }
962}
963
964impl fmt::Debug for JsonishParser {
965    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
966        f.debug_struct("JsonishParser")
967            .field("config", &self.config)
968            .finish()
969    }
970}
971
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed JSON is expected to come back with full confidence and no flags.
    #[test]
    fn test_perfect_json() {
        let healer = JsonishParser::new();
        let parsed = healer.parse(r#"{"key": "value", "num": 42}"#).unwrap();

        assert_eq!(parsed.value["key"], "value");
        assert_eq!(parsed.value["num"], 42);
        assert_eq!(parsed.confidence, 1.0);
        assert!(parsed.flags.is_empty());
    }

    /// A fenced ```json block is unwrapped, flagged, and costs some confidence.
    #[test]
    fn test_markdown_stripping() {
        let raw = r#"```json
{"key": "value"}
```"#;
        let healer = JsonishParser::new();
        let parsed = healer.parse(raw).unwrap();

        assert_eq!(parsed.value["key"], "value");
        assert!(parsed.flags.contains(&CoercionFlag::StrippedMarkdown));
        assert!(parsed.confidence < 1.0);
    }

    /// A trailing comma inside an object is repaired and flagged.
    #[test]
    fn test_trailing_comma() {
        let healer = JsonishParser::new();
        let parsed = healer.parse(r#"{"key": "value", "num": 42,}"#).unwrap();

        assert_eq!(parsed.value["key"], "value");
        assert_eq!(parsed.value["num"], 42);
        assert!(parsed.flags.contains(&CoercionFlag::FixedTrailingComma));
        assert!(parsed.confidence < 1.0);
    }

    /// Single-quoted keys and values are accepted and flagged as a quote fix.
    #[test]
    fn test_single_quotes() {
        let healer = JsonishParser::new();
        let parsed = healer.parse(r#"{'key': 'value'}"#).unwrap();

        assert_eq!(parsed.value["key"], "value");
        assert!(parsed.flags.contains(&CoercionFlag::FixedQuotes));
        assert!(parsed.confidence < 1.0);
    }

    /// Several repairs on one input each leave a flag and compound the
    /// confidence penalty.
    #[test]
    fn test_multiple_fixes() {
        let raw = r#"```json
{'key': 'value',}
```"#;
        let healer = JsonishParser::new();
        let parsed = healer.parse(raw).unwrap();

        assert_eq!(parsed.value["key"], "value");
        assert!(parsed.flags.contains(&CoercionFlag::StrippedMarkdown));
        assert!(parsed.flags.contains(&CoercionFlag::FixedQuotes));
        assert!(parsed.flags.contains(&CoercionFlag::FixedTrailingComma));
        assert!(parsed.confidence < 0.9);
    }

    /// A leading byte-order mark is removed and flagged.
    #[test]
    fn test_bom_removal() {
        let healer = JsonishParser::new();
        let parsed = healer.parse("\u{FEFF}{\"key\": \"value\"}").unwrap();

        assert_eq!(parsed.value["key"], "value");
        assert!(parsed.flags.contains(&CoercionFlag::RemovedBom));
    }

    /// Trailing commas are repaired in top-level arrays as well.
    #[test]
    fn test_arrays() {
        let healer = JsonishParser::new();
        let parsed = healer.parse(r#"[1, 2, 3,]"#).unwrap();

        assert_eq!(parsed.value[0], 1);
        assert_eq!(parsed.value[1], 2);
        assert_eq!(parsed.value[2], 3);
        assert!(parsed.flags.contains(&CoercionFlag::FixedTrailingComma));
    }

    /// Trailing commas at more than one nesting level are repaired.
    #[test]
    fn test_nested_objects() {
        let healer = JsonishParser::new();
        let parsed = healer
            .parse(r#"{"outer": {"inner": "value",},}"#)
            .unwrap();

        assert_eq!(parsed.value["outer"]["inner"], "value");
        assert!(parsed.flags.contains(&CoercionFlag::FixedTrailingComma));
    }

    /// With a very strict threshold, a heavily repaired input must be rejected
    /// with `HealingError::LowConfidence`.
    #[test]
    fn test_low_confidence_rejection() {
        use simple_agent_type::error::SimpleAgentsError;

        let strict = ParserConfig {
            min_confidence: 0.99,
            ..ParserConfig::default()
        };
        let healer = JsonishParser::with_config(strict);

        let raw = r#"```json
{'key': 'value',}
```"#;
        let outcome = healer.parse(raw);

        assert!(outcome.is_err());
        match outcome.unwrap_err() {
            SimpleAgentsError::Healing(HealingError::LowConfidence { .. }) => {}
            e => panic!("Expected LowConfidence error, got: {:?}", e),
        }
    }

    /// Plain prose is not JSON and must produce an error.
    #[test]
    fn test_completely_invalid() {
        let healer = JsonishParser::new();
        let outcome = healer.parse("this is not json at all");

        assert!(outcome.is_err());
    }

    /// Compile-time check that the parser can be shared across threads.
    #[test]
    fn test_parser_is_send_sync() {
        fn require_send_sync<T: Send + Sync>() {}
        require_send_sync::<JsonishParser>();
    }

    /// Disabling markdown stripping leaves the fence in place, so a fenced
    /// input is expected to fail.
    #[test]
    fn test_config_customization() {
        let custom = ParserConfig {
            strip_markdown: false,
            min_confidence: 0.8,
            ..ParserConfig::default()
        };
        let healer = JsonishParser::with_config(custom);

        // The fence must survive pre-processing with this configuration.
        let raw = r#"```json
{"key": "value"}
```"#;
        let outcome = healer.parse(raw);

        // Parsing fails because the markdown fence was not stripped.
        assert!(outcome.is_err());
    }
}