fuzzy_parser/lib.rs
1//! Fuzzy JSON repair for LLM-generated DSL
2//!
3//! This crate provides generic fuzzy matching and automatic correction for
4//! JSON that may contain typos (common when generated by LLMs).
5//!
6//! # Design
7//!
8//! This crate provides **generic repair APIs** - the schema definitions
9//! live in the calling crate (e.g., your application defines the schema).
10//!
11//! # Features
12//!
13//! - **JSON sanitization**: Fix syntax errors (trailing commas, missing braces)
14//! - **Tagged enum repair**: Fix type discriminator typos (e.g., `"AddDeriv"` → `"AddDerive"`)
15//! - **Field name repair**: Fix field name typos (e.g., `"taget"` → `"target"`)
16//! - **Enum array repair**: Fix values in enum arrays (e.g., `["Debg"]` → `["Debug"]`)
17//! - **Nested object repair**: Fix field names in nested objects
18//! - **Multiple algorithm support**: Jaro-Winkler, Levenshtein, Damerau-Levenshtein
19//! - **Configurable similarity threshold**
20//!
21//! # Example
22//!
23//! ```
24//! use fuzzy_parser::{
25//! sanitize_json, repair_tagged_enum_json, TaggedEnumSchema, FuzzyOptions,
26//! };
27//!
28//! // Define schema with enum arrays and nested objects
29//! let schema = TaggedEnumSchema::new(
30//! "type", // tag field
31//! &["AddDerive", "RemoveDerive"], // valid types
32//! |tag| match tag {
33//! "AddDerive" | "RemoveDerive" => Some(&["target", "derives", "config"][..]),
34//! _ => None,
35//! },
36//! )
37//! .with_enum_array("derives", &["Debug", "Clone", "Serialize", "Default"])
38//! .with_nested_object("config", &["timeout", "retries"]);
39//!
40//! // LLM output with syntax errors AND typos
41//! let malformed = r#"{"type": "AddDeriv", "taget": "User", "derives": ["Debg",], "config": {"timout": 30,}}"#;
42//!
43//! // Step 1: Sanitize (fix syntax errors)
44//! let sanitized = sanitize_json(malformed);
45//!
46//! // Step 2: Repair (fix typos)
47//! let result = repair_tagged_enum_json(&sanitized, &schema, &FuzzyOptions::default()).unwrap();
48//!
49//! assert_eq!(result.repaired["type"], "AddDerive");
50//! assert!(result.repaired.get("target").is_some());
51//! assert_eq!(result.repaired["derives"][0], "Debug");
52//! assert!(result.repaired["config"].get("timeout").is_some());
53//! ```
54
55pub mod distance;
56pub mod error;
57pub mod repair;
58pub mod sanitize;
59pub mod schema;
60
61// Re-export main types
62pub use distance::{Algorithm, Match};
63pub use error::FuzzyError;
64pub use repair::{
65 repair_enum_array, repair_fields_with_list, repair_object_fields, repair_tagged_enum,
66 repair_tagged_enum_array, repair_tagged_enum_json, Correction, FuzzyOptions, RepairResult,
67};
68pub use sanitize::sanitize_json;
69pub use schema::{ObjectSchema, TaggedEnumSchema};
70
#[cfg(test)]
mod tests {
    use super::*;

    /// A misspelled tag value and a misspelled field name are both repaired,
    /// and exactly two corrections are reported.
    #[test]
    fn test_full_workflow() {
        // Two variants, each with its own set of allowed field names.
        let schema = TaggedEnumSchema::new(
            "type",
            &["AddDerive", "RenameIdent"],
            |variant| match variant {
                "AddDerive" => Some(&["target", "derives"][..]),
                "RenameIdent" => Some(&["from", "to", "kind"][..]),
                _ => None,
            },
        );
        let options = FuzzyOptions::default();

        // Input carries a tag typo ("AddDeriv") and a field typo ("taget").
        let raw = r#"{
            "type": "AddDeriv",
            "taget": "DatabaseConfig",
            "derives": ["Debug", "Clone"]
        }"#;

        let outcome = repair_tagged_enum_json(raw, &schema, &options).unwrap();
        let fixed = &outcome.repaired;

        assert_eq!(fixed["type"], "AddDerive");
        assert_eq!(fixed["target"], "DatabaseConfig");
        // One correction for the tag, one for the field name.
        assert_eq!(outcome.corrections.len(), 2);
    }

    /// Syntax errors are removed by `sanitize_json` first; the typos in the
    /// tag, the field name, and the enum array values are repaired afterwards.
    #[test]
    fn test_sanitize_then_repair_workflow() {
        // Single-variant schema whose "derives" field is an enum array.
        let schema =
            TaggedEnumSchema::new("type", &["AddDerive"], |_| Some(&["target", "derives"][..]))
                .with_enum_array("derives", &["Debug", "Clone", "Serialize"]);
        let options = FuzzyOptions::default();

        // Trailing commas (syntax errors) combined with misspellings (typos).
        let raw = r#"{
            "type": "AddDeriv",
            "taget": "User",
            "derives": ["Debg", "Clne",],
        }"#;

        // Step 1: sanitize, then confirm no comma sits directly before a closer.
        let cleaned = sanitize_json(raw);
        for forbidden in [",]", ",}"] {
            assert!(!cleaned.contains(forbidden));
        }

        // Step 2: repair the typos.
        let outcome = repair_tagged_enum_json(&cleaned, &schema, &options).unwrap();
        let fixed = &outcome.repaired;

        assert_eq!(fixed["type"], "AddDerive");
        assert!(fixed.get("target").is_some());

        let derives = &fixed["derives"];
        assert_eq!(derives[0], "Debug");
        assert_eq!(derives[1], "Clone");
    }

    /// Output cut off in the middle of a string value is still usable once it
    /// has been sanitized and then repaired.
    #[test]
    fn test_sanitize_then_repair_missing_brace() {
        let schema = TaggedEnumSchema::new("type", &["Action"], |_| Some(&["name"][..]));
        let options = FuzzyOptions::default();

        // The closing quote and the closing brace are both missing.
        let cut_off = r#"{"type": "Action", "name": "test"#;

        let cleaned = sanitize_json(cut_off);
        let outcome = repair_tagged_enum_json(&cleaned, &schema, &options).unwrap();
        let fixed = &outcome.repaired;

        assert_eq!(fixed["type"], "Action");
        assert_eq!(fixed["name"], "test");
    }
}
147}