fuzzy-parser

Automatic JSON repair for LLM-generated output

Overview

LLM-generated JSON often contains typos and syntax errors. fuzzy-parser automatically repairs these issues, enabling robust LLM integration.

use fuzzy_parser::{sanitize_json, repair_tagged_enum_json, TaggedEnumSchema, FuzzyOptions};

// LLM output (typos + syntax errors)
let llm_output = r#"{"type": "AddDeriv", "taget": "User", "derives": ["Debg",],}"#;

// Step 1: Fix syntax errors
let sanitized = sanitize_json(llm_output);

// Step 2: Fix typos
let schema = TaggedEnumSchema::new("type", &["AddDerive"], |_| Some(&["target", "derives"][..]))
    .with_enum_array("derives", &["Debug", "Clone", "Serialize"]);

let result = repair_tagged_enum_json(&sanitized, &schema, &FuzzyOptions::default())?;

assert_eq!(result.repaired["type"], "AddDerive");      // AddDeriv → AddDerive
assert_eq!(result.repaired["target"], "User");          // taget → target
assert_eq!(result.repaired["derives"][0], "Debug");     // Debg → Debug

Features

JSON Sanitization (Syntax Repair)

Error	Before	After
Trailing comma (object)	`{"a": 1,}`	`{"a": 1}`
Trailing comma (array)	`[1, 2,]`	`[1, 2]`
Missing closing brace	`{"a": 1`	`{"a": 1}`
Missing closing bracket	`["a"`	`["a"]`
Unclosed string	`{"a": "test`	`{"a": "test"}`

Fuzzy Repair (Typo Correction)

Target	Before	After
Tag value (enum discriminator)	`"AddDeriv"`	`"AddDerive"`
Field name	`"taget"`	`"target"`
Enum array value	`["Debg"]`	`["Debug"]`
Nested object field	`{"timout": 30}`	`{"timeout": 30}`

Installation

[dependencies]
fuzzy-parser = "0.1"

Usage

Basic Usage

use fuzzy_parser::{sanitize_json, repair_tagged_enum_json, TaggedEnumSchema, FuzzyOptions};

// Define schema
let schema = TaggedEnumSchema::new(
    "type",                                    // tag field name
    &["AddDerive", "RemoveDerive", "Rename"],  // valid tag values
    |tag| match tag {
        "AddDerive" | "RemoveDerive" => Some(&["target", "derives"][..]),
        "Rename" => Some(&["from", "to"][..]),
        _ => None,
    },
);

// Repair
let json = r#"{"type": "AddDeriv", "taget": "User"}"#;
let result = repair_tagged_enum_json(json, &schema, &FuzzyOptions::default())?;

println!("Repaired: {}", result.repaired);
println!("Corrections: {:?}", result.corrections);

Enum Array Repair

let schema = TaggedEnumSchema::new("type", &["AddDerive"], |_| Some(&["target", "derives"][..]))
    .with_enum_array("derives", &["Debug", "Clone", "Serialize", "Default"]);

let json = r#"{"type": "AddDerive", "target": "User", "derives": ["Debg", "Clne"]}"#;
let result = repair_tagged_enum_json(json, &schema, &FuzzyOptions::default())?;

// result.repaired["derives"] is now ["Debug", "Clone"]

Nested Object Repair

let schema = TaggedEnumSchema::new("type", &["Configure"], |_| Some(&["name", "config"][..]))
    .with_nested_object("config", &["timeout", "retries", "enabled"]);

let json = r#"{"type": "Configure", "name": "api", "config": {"timout": 30, "retres": 3}}"#;
let result = repair_tagged_enum_json(json, &schema, &FuzzyOptions::default())?;

// result.repaired["config"] is now {"timeout": 30, "retries": 3}

Combined Sanitization + Repair

use fuzzy_parser::{sanitize_json, repair_tagged_enum_json, TaggedEnumSchema, FuzzyOptions};

let schema = TaggedEnumSchema::new("type", &["Action"], |_| Some(&["name", "data"][..]))
    .with_nested_object("data", &["value", "count"]);

// LLM output (syntax errors + typos)
let malformed = r#"{
    "type": "Acton",
    "nam": "test",
    "data": {"valeu": 42,},
}"#;

// Step 1: Sanitize (fix trailing commas, missing braces, etc.)
let sanitized = sanitize_json(malformed);

// Step 2: Repair (fix typos)
let result = repair_tagged_enum_json(&sanitized, &schema, &FuzzyOptions::default())?;

Custom Options

use fuzzy_parser::{FuzzyOptions, Algorithm};

// Customize similarity threshold and algorithm
let options = FuzzyOptions::default()
    .with_min_similarity(0.8)                 // default: 0.7
    .with_algorithm(Algorithm::Levenshtein);  // default: JaroWinkler

Inspecting Corrections

let result = repair_tagged_enum_json(json, &schema, &options)?;

if result.has_corrections() {
    println!("{} corrections made:", result.correction_count());
    for correction in &result.corrections {
        println!(
            "  {} → {} (similarity: {:.2}, path: {})",
            correction.original,
            correction.corrected,
            correction.similarity,
            correction.field_path
        );
    }
}

Algorithms

Algorithm	Characteristics	Best For
Jaro-Winkler (default)	Prefix-weighted, handles transpositions	General typo correction
Levenshtein	Equal cost for insert/delete/substitute	Plain edit-distance matching
Damerau-Levenshtein	Levenshtein + transposition support	Transposition-heavy typos

Design Principles

- Two-stage processing: Syntax repair (sanitize) and typo repair (repair) are separate steps.
- Schema-driven: The caller defines the schema, so the library itself remains generic.
- Transparency: Every correction is recorded as a Correction struct.
- Safety: No correction is made when the similarity is below the threshold.

License

MIT OR Apache-2.0

fuzzy-parser 0.1.0