fuzzy_parser/
repair.rs

1//! Generic JSON repair logic
2//!
3//! This module provides generic fuzzy repair functions that work with
4//! any schema provided by the caller.
5
6use crate::distance::{find_closest, Algorithm};
7use crate::error::FuzzyError;
8use crate::schema::{ObjectSchema, TaggedEnumSchema};
9use serde_json::{Map, Value};
10
11/// Options for fuzzy repair
12#[derive(Debug, Clone)]
13pub struct FuzzyOptions {
14    /// Minimum similarity threshold (0.0 to 1.0)
15    ///
16    /// Values below this threshold will not be corrected.
17    /// Default: 0.7
18    pub min_similarity: f64,
19
20    /// Algorithm to use for similarity calculation
21    ///
22    /// Default: JaroWinkler (best for typos)
23    pub algorithm: Algorithm,
24}
25
26impl Default for FuzzyOptions {
27    fn default() -> Self {
28        Self {
29            min_similarity: 0.7,
30            algorithm: Algorithm::JaroWinkler,
31        }
32    }
33}
34
35impl FuzzyOptions {
36    /// Create options with a custom minimum similarity threshold
37    pub fn with_min_similarity(mut self, min_similarity: f64) -> Self {
38        self.min_similarity = min_similarity;
39        self
40    }
41
42    /// Create options with a custom algorithm
43    pub fn with_algorithm(mut self, algorithm: Algorithm) -> Self {
44        self.algorithm = algorithm;
45        self
46    }
47}
48
/// Record of one rewrite applied during repair.
#[derive(Debug, Clone, PartialEq)]
pub struct Correction {
    /// Text as it appeared before the repair
    pub original: String,
    /// Text that replaced `original`
    pub corrected: String,
    /// Similarity score (0.0 to 1.0) reported for the match
    pub similarity: f64,
    /// JSON path of the repaired location (e.g., "$.type", "$.target")
    pub field_path: String,
}

impl Correction {
    /// Assemble a correction record from its four parts.
    pub fn new(original: String, corrected: String, similarity: f64, field_path: String) -> Self {
        Correction {
            field_path,
            similarity,
            corrected,
            original,
        }
    }
}
73
74/// Result of a repair operation
75#[derive(Debug, Clone)]
76pub struct RepairResult {
77    /// The repaired JSON value
78    pub repaired: Value,
79    /// List of corrections made
80    pub corrections: Vec<Correction>,
81}
82
83impl RepairResult {
84    /// Check if any corrections were made
85    pub fn has_corrections(&self) -> bool {
86        !self.corrections.is_empty()
87    }
88
89    /// Get the number of corrections made
90    pub fn correction_count(&self) -> usize {
91        self.corrections.len()
92    }
93}
94
95// ============================================================================
96// Generic Repair Functions
97// ============================================================================
98
99/// Repair field names in a JSON object using an ObjectSchema
100///
101/// Returns the list of corrections made.
102pub fn repair_object_fields(
103    obj: &mut Map<String, Value>,
104    schema: &ObjectSchema,
105    path: &str,
106    options: &FuzzyOptions,
107) -> Vec<Correction> {
108    repair_fields_with_list(obj, schema.valid_fields, path, options)
109}
110
111/// Repair field names in a JSON object using a field list
112///
113/// Returns the list of corrections made.
114pub fn repair_fields_with_list(
115    obj: &mut Map<String, Value>,
116    valid_fields: &[&str],
117    path: &str,
118    options: &FuzzyOptions,
119) -> Vec<Correction> {
120    let mut corrections = Vec::new();
121
122    // Collect keys that need correction
123    let keys_to_check: Vec<String> = obj
124        .keys()
125        .filter(|k| !valid_fields.contains(&k.as_str()))
126        .cloned()
127        .collect();
128
129    // Process each invalid key
130    for key in keys_to_check {
131        if let Some(m) = find_closest(
132            &key,
133            valid_fields.iter().copied(),
134            options.min_similarity,
135            options.algorithm,
136        ) {
137            // Only correct if the target field doesn't already exist
138            if !obj.contains_key(&m.candidate) {
139                if let Some(val) = obj.remove(&key) {
140                    corrections.push(Correction::new(
141                        key.clone(),
142                        m.candidate.clone(),
143                        m.similarity,
144                        format!("{}.{}", path, key),
145                    ));
146                    obj.insert(m.candidate, val);
147                }
148            }
149        }
150    }
151
152    corrections
153}
154
155/// Repair a tagged enum JSON object using a TaggedEnumSchema
156///
157/// This repairs:
158/// 1. The tag field value (e.g., "AddDeriv" -> "AddDerive")
159/// 2. The field names based on the tag value
160/// 3. Values in enum array fields (e.g., ["Debg"] -> ["Debug"])
161/// 4. Field names in nested objects
162///
163/// Returns the list of corrections made.
164pub fn repair_tagged_enum<F>(
165    obj: &mut Map<String, Value>,
166    schema: &TaggedEnumSchema<F>,
167    path: &str,
168    options: &FuzzyOptions,
169) -> Vec<Correction>
170where
171    F: Fn(&str) -> Option<&'static [&'static str]>,
172{
173    let mut corrections = Vec::new();
174
175    // Step 1: Repair tag field value
176    let tag_value = if let Some(tag_val) = obj.get(schema.tag_field).and_then(|v| v.as_str()) {
177        if !schema.is_valid_tag(tag_val) {
178            // Try to find closest match
179            if let Some(m) = find_closest(
180                tag_val,
181                schema.valid_tags.iter().copied(),
182                options.min_similarity,
183                options.algorithm,
184            ) {
185                corrections.push(Correction::new(
186                    tag_val.to_string(),
187                    m.candidate.clone(),
188                    m.similarity,
189                    format!("{}.{}", path, schema.tag_field),
190                ));
191                obj.insert(
192                    schema.tag_field.to_string(),
193                    Value::String(m.candidate.clone()),
194                );
195                m.candidate
196            } else {
197                tag_val.to_string()
198            }
199        } else {
200            tag_val.to_string()
201        }
202    } else {
203        return corrections; // No tag field, can't repair fields
204    };
205
206    // Step 2: Repair field names based on tag value
207    if let Some(valid_fields) = schema.get_fields(&tag_value) {
208        // Filter out the tag field itself from the check
209        let keys_to_check: Vec<String> = obj
210            .keys()
211            .filter(|k| *k != schema.tag_field && !valid_fields.contains(&k.as_str()))
212            .cloned()
213            .collect();
214
215        for key in keys_to_check {
216            if let Some(m) = find_closest(
217                &key,
218                valid_fields.iter().copied(),
219                options.min_similarity,
220                options.algorithm,
221            ) {
222                if !obj.contains_key(&m.candidate) {
223                    if let Some(val) = obj.remove(&key) {
224                        corrections.push(Correction::new(
225                            key.clone(),
226                            m.candidate.clone(),
227                            m.similarity,
228                            format!("{}.{}", path, key),
229                        ));
230                        obj.insert(m.candidate, val);
231                    }
232                }
233            }
234        }
235    }
236
237    // Step 3: Repair enum array values
238    for (field_name, valid_values) in &schema.enum_arrays {
239        if let Some(Value::Array(arr)) = obj.get_mut(*field_name) {
240            let field_path = format!("{}.{}", path, field_name);
241            let arr_corrections = repair_enum_array(arr, valid_values, &field_path, options);
242            corrections.extend(arr_corrections);
243        }
244    }
245
246    // Step 4: Repair nested object fields
247    for (field_name, valid_fields) in &schema.nested_objects {
248        if let Some(Value::Object(nested_obj)) = obj.get_mut(*field_name) {
249            let nested_path = format!("{}.{}", path, field_name);
250            let nested_corrections =
251                repair_fields_with_list(nested_obj, valid_fields, &nested_path, options);
252            corrections.extend(nested_corrections);
253        }
254    }
255
256    corrections
257}
258
259/// Repair values in an enum array
260///
261/// Each string value in the array is fuzzy-matched against `valid_values`.
262pub fn repair_enum_array(
263    arr: &mut [Value],
264    valid_values: &[&str],
265    path: &str,
266    options: &FuzzyOptions,
267) -> Vec<Correction> {
268    let mut corrections = Vec::new();
269
270    for (i, item) in arr.iter_mut().enumerate() {
271        if let Value::String(s) = item {
272            if !valid_values.contains(&s.as_str()) {
273                if let Some(m) = find_closest(
274                    s,
275                    valid_values.iter().copied(),
276                    options.min_similarity,
277                    options.algorithm,
278                ) {
279                    corrections.push(Correction::new(
280                        s.clone(),
281                        m.candidate.clone(),
282                        m.similarity,
283                        format!("{}[{}]", path, i),
284                    ));
285                    *item = Value::String(m.candidate);
286                }
287            }
288        }
289    }
290
291    corrections
292}
293
294/// Repair a tagged enum from JSON string
295pub fn repair_tagged_enum_json<F>(
296    json: &str,
297    schema: &TaggedEnumSchema<F>,
298    options: &FuzzyOptions,
299) -> Result<RepairResult, FuzzyError>
300where
301    F: Fn(&str) -> Option<&'static [&'static str]>,
302{
303    let mut value: Value = serde_json::from_str(json)?;
304
305    let corrections = if let Some(obj) = value.as_object_mut() {
306        repair_tagged_enum(obj, schema, "$", options)
307    } else {
308        return Err(FuzzyError::NotObject);
309    };
310
311    Ok(RepairResult {
312        repaired: value,
313        corrections,
314    })
315}
316
317/// Repair an array of tagged enums
318pub fn repair_tagged_enum_array<F>(
319    arr: &mut [Value],
320    schema: &TaggedEnumSchema<F>,
321    path: &str,
322    options: &FuzzyOptions,
323) -> Vec<Correction>
324where
325    F: Fn(&str) -> Option<&'static [&'static str]>,
326{
327    let mut all_corrections = Vec::new();
328
329    for (i, item) in arr.iter_mut().enumerate() {
330        if let Some(obj) = item.as_object_mut() {
331            let item_path = format!("{}[{}]", path, i);
332            let corrections = repair_tagged_enum(obj, schema, &item_path, options);
333            all_corrections.extend(corrections);
334        }
335    }
336
337    all_corrections
338}
339
#[cfg(test)]
mod tests {
    //! Unit tests for the generic repair functions.
    //!
    //! Besides the repaired values, the tests pin the reported `field_path`
    //! of corrections, the error returned for non-object input, and the rule
    //! that an existing field is never overwritten by a renamed one.

    use super::*;

    /// Schema with three tags; the closure maps each tag to its valid fields.
    fn test_schema() -> TaggedEnumSchema<fn(&str) -> Option<&'static [&'static str]>> {
        TaggedEnumSchema::new(
            "type",
            &["AddDerive", "RemoveDerive", "RenameIdent"],
            |tag| match tag {
                "AddDerive" | "RemoveDerive" => Some(&["target", "derives"]),
                "RenameIdent" => Some(&["from", "to", "kind"]),
                _ => None,
            },
        )
    }

    #[test]
    fn test_repair_tagged_enum_type_typo() {
        let schema = test_schema();
        let json = r#"{"type": "AddDeriv", "target": "User", "derives": ["Debug"]}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert_eq!(result.repaired["type"], "AddDerive");
        assert_eq!(result.corrections.len(), 1);
        assert_eq!(result.corrections[0].original, "AddDeriv");
        assert_eq!(result.corrections[0].corrected, "AddDerive");
        assert_eq!(result.corrections[0].field_path, "$.type");
    }

    #[test]
    fn test_repair_tagged_enum_field_typo() {
        let schema = test_schema();
        let json = r#"{"type": "AddDerive", "taget": "User", "derives": ["Debug"]}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert!(result.repaired.get("target").is_some());
        assert!(result.repaired.get("taget").is_none());
        assert_eq!(result.corrections.len(), 1);
        assert_eq!(result.corrections[0].original, "taget");
        assert_eq!(result.corrections[0].corrected, "target");
        // The path names the key as it was written in the input.
        assert_eq!(result.corrections[0].field_path, "$.taget");
    }

    #[test]
    fn test_repair_tagged_enum_multiple_typos() {
        let schema = test_schema();
        let json = r#"{"type": "RenamIdent", "form": "old", "too": "new"}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert_eq!(result.repaired["type"], "RenameIdent");
        assert!(result.repaired.get("from").is_some());
        assert!(result.repaired.get("to").is_some());
        assert_eq!(result.corrections.len(), 3);
    }

    #[test]
    fn test_repair_object_fields() {
        let schema = ObjectSchema::new(&["name", "module", "derives"]);
        let mut obj: Map<String, Value> =
            serde_json::from_str(r#"{"nam": "Test", "modul": "foo"}"#).unwrap();
        let options = FuzzyOptions::default();

        let corrections = repair_object_fields(&mut obj, &schema, "$", &options);

        assert!(obj.contains_key("name"));
        assert!(obj.contains_key("module"));
        assert_eq!(corrections.len(), 2);
    }

    #[test]
    fn test_no_correction_needed() {
        let schema = test_schema();
        let json = r#"{"type": "AddDerive", "target": "User", "derives": ["Debug"]}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert!(!result.has_corrections());
        assert_eq!(result.correction_count(), 0);
    }

    #[test]
    fn test_high_similarity_threshold() {
        let schema = test_schema();
        let json = r#"{"type": "AddDeriv", "target": "User", "derives": ["Debug"]}"#;
        let options = FuzzyOptions::default().with_min_similarity(0.99);

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        // With very high threshold, typo should not be corrected
        assert_eq!(result.repaired["type"], "AddDeriv");
        assert!(!result.has_corrections());
    }

    #[test]
    fn test_repair_array() {
        let schema = test_schema();
        let mut arr: Vec<Value> = serde_json::from_str(
            r#"[
                {"type": "AddDeriv", "taget": "User", "derives": ["Debug"]},
                {"type": "RenamIdent", "form": "old", "too": "new"}
            ]"#,
        )
        .unwrap();
        let options = FuzzyOptions::default();

        let corrections = repair_tagged_enum_array(&mut arr, &schema, "$.intents", &options);

        assert_eq!(arr[0]["type"], "AddDerive");
        assert!(arr[0].get("target").is_some());
        assert_eq!(arr[1]["type"], "RenameIdent");
        assert!(arr[1].get("from").is_some());
        // First item: tag + "taget"; second item: tag + "form" + "too".
        assert_eq!(corrections.len(), 5);
        // Items are processed in order and the tag is repaired first.
        assert_eq!(corrections[0].field_path, "$.intents[0].type");
        assert!(corrections
            .iter()
            .any(|c| c.field_path == "$.intents[1].type"));
    }

    #[test]
    fn test_repair_enum_array_values() {
        let schema =
            TaggedEnumSchema::new("type", &["AddDerive"], |_| Some(&["target", "derives"][..]))
                .with_enum_array("derives", &["Debug", "Clone", "Serialize", "Default"]);

        let json =
            r#"{"type": "AddDerive", "target": "User", "derives": ["Debg", "Clne", "Serializ"]}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert_eq!(result.repaired["derives"][0], "Debug");
        assert_eq!(result.repaired["derives"][1], "Clone");
        assert_eq!(result.repaired["derives"][2], "Serialize");
        assert_eq!(result.corrections.len(), 3);
        // Array elements are visited in index order.
        assert_eq!(result.corrections[0].field_path, "$.derives[0]");
        assert_eq!(result.corrections[2].field_path, "$.derives[2]");
    }

    #[test]
    fn test_repair_nested_object_fields() {
        let schema =
            TaggedEnumSchema::new("type", &["Configure"], |_| Some(&["name", "config"][..]))
                .with_nested_object("config", &["timeout", "retries", "enabled"]);

        let json =
            r#"{"type": "Configure", "name": "test", "config": {"timout": 30, "retres": 3}}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert!(result.repaired["config"].get("timeout").is_some());
        assert!(result.repaired["config"].get("retries").is_some());
        assert_eq!(result.repaired["config"]["timeout"], 30);
        assert_eq!(result.repaired["config"]["retries"], 3);
        assert_eq!(result.corrections.len(), 2);
        // Key iteration order is not assumed here, so search instead of indexing.
        assert!(result
            .corrections
            .iter()
            .any(|c| c.field_path == "$.config.timout"));
    }

    #[test]
    fn test_repair_combined_all_features() {
        let schema = TaggedEnumSchema::new("type", &["AddDerive"], |_| {
            Some(&["target", "derives", "config"][..])
        })
        .with_enum_array("derives", &["Debug", "Clone", "Serialize"])
        .with_nested_object("config", &["timeout", "retries"]);

        let json = r#"{
            "type": "AddDeriv",
            "taget": "User",
            "derives": ["Debg", "Clne"],
            "config": {"timout": 30}
        }"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        // Tag value repaired
        assert_eq!(result.repaired["type"], "AddDerive");
        // Field name repaired
        assert!(result.repaired.get("target").is_some());
        assert_eq!(result.repaired["target"], "User");
        // Enum array values repaired
        assert_eq!(result.repaired["derives"][0], "Debug");
        assert_eq!(result.repaired["derives"][1], "Clone");
        // Nested object field repaired
        assert!(result.repaired["config"].get("timeout").is_some());
        assert_eq!(result.repaired["config"]["timeout"], 30);
        // Total corrections: type + target + 2 derives + timeout = 5
        assert_eq!(result.corrections.len(), 5);
    }

    #[test]
    fn test_repair_enum_array_no_correction_needed() {
        let schema =
            TaggedEnumSchema::new("type", &["AddDerive"], |_| Some(&["target", "derives"][..]))
                .with_enum_array("derives", &["Debug", "Clone"]);

        let json = r#"{"type": "AddDerive", "target": "User", "derives": ["Debug", "Clone"]}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        assert!(!result.has_corrections());
    }

    #[test]
    fn test_non_object_json_is_rejected() {
        let schema = test_schema();
        let options = FuzzyOptions::default();

        let outcome = repair_tagged_enum_json("[1, 2, 3]", &schema, &options);

        assert!(matches!(outcome, Err(FuzzyError::NotObject)));
    }

    #[test]
    fn test_existing_field_is_not_overwritten() {
        let schema = test_schema();
        let json = r#"{"type": "AddDerive", "target": "User", "taget": "Other"}"#;
        let options = FuzzyOptions::default();

        let result = repair_tagged_enum_json(json, &schema, &options).unwrap();

        // "target" already exists, so the misspelled key must be left alone.
        assert_eq!(result.repaired["target"], "User");
        assert_eq!(result.repaired["taget"], "Other");
        assert!(!result.has_corrections());
    }
}