fuzzy_parser/
lib.rs

1//! Fuzzy JSON repair for LLM-generated DSL
2//!
3//! This crate provides generic fuzzy matching and automatic correction for
4//! JSON that may contain typos (common when generated by LLMs).
5//!
6//! # Design
7//!
8//! This crate provides **generic repair APIs** - the schema definitions
9//! live in the calling crate (e.g., your application defines the schema).
10//!
11//! # Features
12//!
13//! - **JSON sanitization**: Fix syntax errors (trailing commas, missing braces)
14//! - **Tagged enum repair**: Fix type discriminator typos (e.g., `"AddDeriv"` → `"AddDerive"`)
15//! - **Field name repair**: Fix field name typos (e.g., `"taget"` → `"target"`)
16//! - **Enum array repair**: Fix values in enum arrays (e.g., `["Debg"]` → `["Debug"]`)
17//! - **Nested object repair**: Fix field names in nested objects
18//! - **Multiple algorithm support**: Jaro-Winkler, Levenshtein, Damerau-Levenshtein
19//! - **Configurable similarity threshold**
20//!
//! # Examples
22//!
23//! ```
24//! use fuzzy_parser::{
25//!     sanitize_json, repair_tagged_enum_json, TaggedEnumSchema, FuzzyOptions,
26//! };
27//!
28//! // Define schema with enum arrays and nested objects
29//! let schema = TaggedEnumSchema::new(
30//!     "type",  // tag field
31//!     &["AddDerive", "RemoveDerive"],  // valid types
32//!     |tag| match tag {
33//!         "AddDerive" | "RemoveDerive" => Some(&["target", "derives", "config"][..]),
34//!         _ => None,
35//!     },
36//! )
37//! .with_enum_array("derives", &["Debug", "Clone", "Serialize", "Default"])
38//! .with_nested_object("config", &["timeout", "retries"]);
39//!
40//! // LLM output with syntax errors AND typos
41//! let malformed = r#"{"type": "AddDeriv", "taget": "User", "derives": ["Debg",], "config": {"timout": 30,}}"#;
42//!
43//! // Step 1: Sanitize (fix syntax errors)
44//! let sanitized = sanitize_json(malformed);
45//!
46//! // Step 2: Repair (fix typos)
47//! let result = repair_tagged_enum_json(&sanitized, &schema, &FuzzyOptions::default()).unwrap();
48//!
49//! assert_eq!(result.repaired["type"], "AddDerive");
50//! assert!(result.repaired.get("target").is_some());
51//! assert_eq!(result.repaired["derives"][0], "Debug");
52//! assert!(result.repaired["config"].get("timeout").is_some());
53//! ```
54
55pub mod distance;
56pub mod error;
57pub mod repair;
58pub mod sanitize;
59pub mod schema;
60
61// Re-export main types
62pub use distance::{Algorithm, Match};
63pub use error::FuzzyError;
64pub use repair::{
65    repair_enum_array, repair_fields_with_list, repair_object_fields, repair_tagged_enum,
66    repair_tagged_enum_array, repair_tagged_enum_json, Correction, FuzzyOptions, RepairResult,
67};
68pub use sanitize::sanitize_json;
69pub use schema::{ObjectSchema, TaggedEnumSchema};
70
#[cfg(test)]
mod tests {
    use super::*;

    /// A misspelled tag value and a misspelled field name are both corrected,
    /// and exactly two corrections are reported.
    #[test]
    fn test_full_workflow() {
        // Two variants, each with its own list of permitted field names.
        let schema = TaggedEnumSchema::new("type", &["AddDerive", "RenameIdent"], |variant| {
            match variant {
                "AddDerive" => Some(&["target", "derives"][..]),
                "RenameIdent" => Some(&["from", "to", "kind"][..]),
                _ => None,
            }
        });
        let options = FuzzyOptions::default();

        // Syntactically valid JSON whose tag ("AddDeriv") and one key ("taget")
        // are misspelled, as an LLM might produce.
        let raw = r#"{
            "type": "AddDeriv",
            "taget": "DatabaseConfig",
            "derives": ["Debug", "Clone"]
        }"#;

        let outcome = repair_tagged_enum_json(raw, &schema, &options).unwrap();
        let fixed = &outcome.repaired;

        assert_eq!(fixed["type"], "AddDerive");
        assert_eq!(fixed["target"], "DatabaseConfig");
        // One correction for the tag, one for the field name.
        assert_eq!(outcome.corrections.len(), 2);
    }

    /// Input with trailing commas *and* typos goes through `sanitize_json`
    /// first and is then repaired against a schema that declares an enum array.
    #[test]
    fn test_sanitize_then_repair_workflow() {
        let schema =
            TaggedEnumSchema::new("type", &["AddDerive"], |_| Some(&["target", "derives"][..]))
                .with_enum_array("derives", &["Debug", "Clone", "Serialize"]);
        let options = FuzzyOptions::default();

        // Trailing commas after the last array element and the last member,
        // plus misspelled tag, key, and enum values.
        let raw = r#"{
            "type": "AddDeriv",
            "taget": "User",
            "derives": ["Debg", "Clne",],
        }"#;

        // Step 1: syntax cleanup.
        let cleaned = sanitize_json(raw);
        for fragment in [",]", ",}"] {
            assert!(!cleaned.contains(fragment));
        }

        // Step 2: typo repair on the cleaned text.
        let outcome = repair_tagged_enum_json(&cleaned, &schema, &options).unwrap();
        let fixed = &outcome.repaired;

        assert_eq!(fixed["type"], "AddDerive");
        assert!(fixed.get("target").is_some());
        assert_eq!(fixed["derives"][0], "Debug");
        assert_eq!(fixed["derives"][1], "Clone");
    }

    /// Output cut off mid-string (no closing quote or brace) still yields the
    /// expected tag and field after sanitizing and repairing.
    #[test]
    fn test_sanitize_then_repair_missing_brace() {
        let schema = TaggedEnumSchema::new("type", &["Action"], |_| Some(&["name"][..]));
        let options = FuzzyOptions::default();

        // The text stops before the closing quote of "test" and the final brace.
        let cut_off = r#"{"type": "Action", "name": "test"#;

        let cleaned = sanitize_json(cut_off);
        let outcome = repair_tagged_enum_json(&cleaned, &schema, &options).unwrap();
        let fixed = &outcome.repaired;

        assert_eq!(fixed["type"], "Action");
        assert_eq!(fixed["name"], "test");
    }
}